Commit 8f7de847 authored by yuguo960516yuguo
Browse files

dtk

parent f262efc9
Pipeline #248 failed with stages
in 0 seconds
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/common/primitive/binary_functor.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
// Pow functor family for the CUDA/ROCm backend.
template<typename Src, typename Dst>
struct BinaryFunctor<DeviceType::kCUDA, BinaryOp::kPow, Src, Dst> {
  OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {}
  // Generic case: defer to the pow overload selected for Src.
  OF_DEVICE_FUNC Dst operator()(Src src0, Src src1) const { return pow(src0, src1); }
};

template<>
struct BinaryFunctor<DeviceType::kCUDA, BinaryOp::kPow, bool, bool> {
  OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {}
  // pow over {0,1}: pow(1, x) == 1, pow(0, 0) == 1, pow(0, 1) == 0, i.e.
  // src0 || !src1.  Evaluate the truth table directly instead of paying for a
  // double-precision pow call per element.
  OF_DEVICE_FUNC bool operator()(bool src0, bool src1) const { return src0 || !src1; }
};

template<>
struct BinaryFunctor<DeviceType::kCUDA, BinaryOp::kPow, half, half> {
  OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {}
  // half has no native pow; compute in float and round back to half.
  OF_DEVICE_FUNC half operator()(half src0, half src1) const {
    return static_cast<half>(pow(static_cast<float>(src0), static_cast<float>(src1)));
  }
};
// Backward of GELU given (dy, x): d/dx[0.5 * x * (1 + erf(x / sqrt(2)))] * dy.
template<typename Src, typename Dst>
struct BinaryFunctor<DeviceType::kCUDA, BinaryOp::kGeluBackwardWithDyX, Src, Dst> {
  OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {
    // coef = sqrt(2 / pi).  The CUDA and HIP device branches were byte-for-byte
    // identical, so they are folded into a single preprocessor condition.
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
    coef = sqrt(static_cast<Src>(2.0) / acos(static_cast<Src>(-1.0)));
#else
    coef = std::sqrt(static_cast<Src>(2.0) / std::acos(static_cast<Src>(-1.0)));
#endif
  }
  OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const {
    return static_cast<Src>(0.5)
           * (static_cast<Src>(1.0) + erf(static_cast<Src>(M_SQRT1_2) * x)
              + x * coef * exp(static_cast<Src>(-0.5) * x * x))
           * dy;
  }
  Src coef;  // sqrt(2 / pi), computed once per functor.
};
// Backward of tanh given (dy, x): dy * (1 - tanh(x)^2).
template<typename Src, typename Dst>
struct BinaryFunctor<DeviceType::kCUDA, BinaryOp::kTanhBackwardWithDyX, Src, Dst> {
  OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const {
    const Src t = tanh(x);
    const Src sech_sq = static_cast<Src>(1.0) - t * t;
    return static_cast<Dst>(dy * sech_sq);
  }
};
// /*********nv_bfloat16_kernel*******/
// #if CUDA_VERSION >= 11000
// template<>
// struct BinaryFunctor<DeviceType::kCUDA, BinaryOp::kPow, nv_bfloat16, nv_bfloat16> {
// OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {}
// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src0, nv_bfloat16 src1) const {
// return static_cast<nv_bfloat16>(pow(static_cast<float>(src0), static_cast<float>(src1)));
// }
// };
// #define SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(op) \
// template<> \
// struct BinaryFunctor<DeviceType::kCUDA, op, nv_bfloat16, nv_bfloat16> { \
// OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
// \
// BinaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src0, nv_bfloat16 src1) const { \
// return __float2bfloat16(float_functor(__bfloat162float(src0), __bfloat162float(src1))); \
// } \
// };
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kEluBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kCeluBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kGeluBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardswishBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardsigmoidBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardshrinkBackwardWithDyY);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardtanhBackwardWithDyY);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kLeakyReluBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kMishBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSeluBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSiluBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftsignBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftplusBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftshrinkBackwardWithDyY);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX);
// #endif // CUDA_VERSION >= 11000
// Defines a half specialization of an activation-backward functor by promoting
// both operands to float, applying the float functor, and rounding the result
// back to half.
// NOTE(review): unlike the commented-out bfloat16 list above, this list omits
// kHardsigmoidBackwardWithDyX, kHardtanhBackwardWithDyY and
// kLeakyReluBackwardWithDyX -- presumably those have dedicated half functors
// elsewhere (e.g. the common header included at the top); confirm before
// relying on this list being exhaustive.
#define SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(op) \
template<> \
struct BinaryFunctor<DeviceType::kCUDA, op, half, half> { \
OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
\
BinaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
OF_DEVICE_FUNC half operator()(half src0, half src1) const { \
return __float2half(float_functor(__half2float(src0), __half2float(src1))); \
} \
};
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kEluBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kCeluBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kGeluBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kHardswishBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kHardshrinkBackwardWithDyY);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kMishBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSiluBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSeluBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftplusBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftsignBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftshrinkBackwardWithDyY);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyX);
} // namespace broadcast_elementwise_binary
} // namespace primitive
} // namespace ep
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/common/primitive/binary_functor.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
// Pow functor family for the CUDA/ROCm backend.
template<typename Src, typename Dst>
struct BinaryFunctor<DeviceType::kCUDA, BinaryOp::kPow, Src, Dst> {
  OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {}
  // Generic case: defer to the pow overload selected for Src.
  OF_DEVICE_FUNC Dst operator()(Src src0, Src src1) const { return pow(src0, src1); }
};

template<>
struct BinaryFunctor<DeviceType::kCUDA, BinaryOp::kPow, bool, bool> {
  OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {}
  // pow over {0,1}: pow(1, x) == 1, pow(0, 0) == 1, pow(0, 1) == 0, i.e.
  // src0 || !src1.  Evaluate the truth table directly instead of paying for a
  // double-precision pow call per element.
  OF_DEVICE_FUNC bool operator()(bool src0, bool src1) const { return src0 || !src1; }
};

template<>
struct BinaryFunctor<DeviceType::kCUDA, BinaryOp::kPow, half, half> {
  OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {}
  // half has no native pow; compute in float and round back to half.
  OF_DEVICE_FUNC half operator()(half src0, half src1) const {
    return static_cast<half>(pow(static_cast<float>(src0), static_cast<float>(src1)));
  }
};
// Backward of GELU given (dy, x): d/dx[0.5 * x * (1 + erf(x / sqrt(2)))] * dy.
template<typename Src, typename Dst>
struct BinaryFunctor<DeviceType::kCUDA, BinaryOp::kGeluBackwardWithDyX, Src, Dst> {
  OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {
    // coef = sqrt(2 / pi).  The CUDA and HIP device branches were byte-for-byte
    // identical, so they are folded into a single preprocessor condition.
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
    coef = sqrt(static_cast<Src>(2.0) / acos(static_cast<Src>(-1.0)));
#else
    coef = std::sqrt(static_cast<Src>(2.0) / std::acos(static_cast<Src>(-1.0)));
#endif
  }
  OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const {
    return static_cast<Src>(0.5)
           * (static_cast<Src>(1.0) + erf(static_cast<Src>(M_SQRT1_2) * x)
              + x * coef * exp(static_cast<Src>(-0.5) * x * x))
           * dy;
  }
  Src coef;  // sqrt(2 / pi), computed once per functor.
};
// Backward of tanh given (dy, x): dy * (1 - tanh(x)^2).
template<typename Src, typename Dst>
struct BinaryFunctor<DeviceType::kCUDA, BinaryOp::kTanhBackwardWithDyX, Src, Dst> {
  OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC Dst operator()(Src dy, Src x) const {
    const Src t = tanh(x);
    const Src sech_sq = static_cast<Src>(1.0) - t * t;
    return static_cast<Dst>(dy * sech_sq);
  }
};
// /*********nv_bfloat16_kernel*******/
// #if CUDA_VERSION >= 11000
// template<>
// struct BinaryFunctor<DeviceType::kCUDA, BinaryOp::kPow, nv_bfloat16, nv_bfloat16> {
// OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) {}
// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src0, nv_bfloat16 src1) const {
// return static_cast<nv_bfloat16>(pow(static_cast<float>(src0), static_cast<float>(src1)));
// }
// };
// #define SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(op) \
// template<> \
// struct BinaryFunctor<DeviceType::kCUDA, op, nv_bfloat16, nv_bfloat16> { \
// OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
// \
// BinaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src0, nv_bfloat16 src1) const { \
// return __float2bfloat16(float_functor(__bfloat162float(src0), __bfloat162float(src1))); \
// } \
// };
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kEluBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kCeluBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kGeluBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardswishBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardsigmoidBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardshrinkBackwardWithDyY);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kHardtanhBackwardWithDyY);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kLeakyReluBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kMishBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSeluBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSiluBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftsignBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftplusBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kSoftshrinkBackwardWithDyY);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyX);
// SPECIALIZATION_PSEUDO_BFLOAT16_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX);
// #endif // CUDA_VERSION >= 11000
// Defines a half specialization of an activation-backward functor by promoting
// both operands to float, applying the float functor, and rounding the result
// back to half.
// NOTE(review): unlike the commented-out bfloat16 list above, this list omits
// kHardsigmoidBackwardWithDyX, kHardtanhBackwardWithDyY and
// kLeakyReluBackwardWithDyX -- presumably those have dedicated half functors
// elsewhere (e.g. the common header included at the top); confirm before
// relying on this list being exhaustive.
#define SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(op) \
template<> \
struct BinaryFunctor<DeviceType::kCUDA, op, half, half> { \
OF_DEVICE_FUNC BinaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
\
BinaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
OF_DEVICE_FUNC half operator()(half src0, half src1) const { \
return __float2half(float_functor(__half2float(src0), __half2float(src1))); \
} \
};
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kEluBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kCeluBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kGeluBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kHardswishBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kHardshrinkBackwardWithDyY);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kMishBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSiluBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSeluBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftplusBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftsignBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kSoftshrinkBackwardWithDyY);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kThresholdBackwardWithDyX);
SPECIALIZATION_PSEUDO_HALF_BINARY_FUNCTOR(BinaryOp::kTanhBackwardWithDyX);
} // namespace broadcast_elementwise_binary
} // namespace primitive
} // namespace ep
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/include/primitive/broadcast_elementwise_binary.h"
#include "oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ep/rocm/primitive/binary_functor.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
// Forward declaration of the templated factory function; the explicit
// instantiations live in per-op translation units.
template<BinaryOp binary_op, typename Src, typename Dst>
std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary(Scalar attr0,
Scalar attr1);
namespace {
// Factory producing BroadcastElementwiseBinary primitives for this backend.
// Dispatch is a static table keyed on (op, src dtype, dst dtype); unsupported
// combinations (or too many dims) yield nullptr.
class BroadcastElementwiseBinaryFactoryImpl : public BroadcastElementwiseBinaryFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseBinaryFactoryImpl);
  BroadcastElementwiseBinaryFactoryImpl() = default;
  ~BroadcastElementwiseBinaryFactoryImpl() override = default;

  // No-attribute overload: forwards with default-constructed Scalars.
  std::unique_ptr<BroadcastElementwiseBinary> New(BinaryOp op, DataType src_type, DataType dst_type,
                                                  size_t max_num_dims) override {
    return New(op, src_type, dst_type, max_num_dims, Scalar(), Scalar());
  }

  // Single-attribute overload: forwards with a default second attribute.
  std::unique_ptr<BroadcastElementwiseBinary> New(BinaryOp op, DataType src_type, DataType dst_type,
                                                  size_t max_num_dims, Scalar attr0) override {
    return New(op, src_type, dst_type, max_num_dims, attr0, Scalar());
  }

  std::unique_ptr<BroadcastElementwiseBinary> New(BinaryOp binary_op, DataType src_type,
                                                  DataType dst_type, size_t max_num_dims,
                                                  Scalar attr0, Scalar attr1) override {
    if (max_num_dims > kMaxNumDims) { return nullptr; }
// Table-entry helpers: each expands to one map entry mapping an
// (op, src dtype, dst dtype) key to the matching templated constructor.
#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY(binary_op, data_type_pair) \
  {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(data_type_pair),                    \
                   OF_PP_PAIR_SECOND(data_type_pair)),                              \
   NewBroadcastElementwiseBinary<binary_op, OF_PP_PAIR_FIRST(data_type_pair),       \
                                 OF_PP_PAIR_FIRST(data_type_pair)>},

#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY(      \
    binary_op, src_data_type_pair, dst_data_type_pair)                            \
  {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(src_data_type_pair),              \
                   OF_PP_PAIR_SECOND(dst_data_type_pair)),                        \
   NewBroadcastElementwiseBinary<binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), \
                                 OF_PP_PAIR_FIRST(dst_data_type_pair)>},

#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY(binary_op, data_type_pair) \
  {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(data_type_pair),                               \
                   OF_PP_PAIR_SECOND(data_type_pair)),                                         \
   NewBroadcastElementwiseBinary<binary_op, OF_PP_PAIR_FIRST(data_type_pair),                  \
                                 OF_PP_PAIR_FIRST(data_type_pair)>},

    static const std::map<
        std::tuple<BinaryOp, DataType, DataType>,
        std::function<std::unique_ptr<BroadcastElementwiseBinary>(Scalar, Scalar)>>
        new_broadcast_elementwise_binary_handle{
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY,
                                             BINARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ)
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(
                MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY,
                BINARY_COMPARISION_OP_SEQ BINARY_LOGICAL_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ,
                CUDA_PRIMITIVE_BOOL_TYPE_SEQ)
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(
                MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY,
                BINARY_ACTIVATION_BACKWARD_OP_SEQ, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)};
// Fix: the ACTIVATION_GRAD entry macro was previously never #undef'd and
// leaked past this function; all three helpers are now scoped symmetrically.
#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY
#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY
#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY
    const auto it = new_broadcast_elementwise_binary_handle.find(
        std::make_tuple(binary_op, src_type, dst_type));
    if (it != new_broadcast_elementwise_binary_handle.end()) {
      return it->second(attr0, attr1);
    } else {
      return nullptr;
    }
  }
};
// Register the factory under DeviceType::kCUDA.
// NOTE(review): this TU includes ROCm/HIP headers yet registers under the
// kCUDA tag -- apparently the ROCm port reuses the CUDA device enum; confirm.
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, BroadcastElementwiseBinaryFactory,
BroadcastElementwiseBinaryFactoryImpl);
} // namespace
} // namespace broadcast_elementwise_binary
} // namespace primitive
} // namespace ep
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/include/primitive/broadcast_elementwise_binary.h"
#include "oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ep/rocm/primitive/binary_functor.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
// Forward declaration of the templated factory function; the explicit
// instantiations live in per-op translation units.
template<BinaryOp binary_op, typename Src, typename Dst>
std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary(Scalar attr0,
Scalar attr1);
namespace {
// Factory producing BroadcastElementwiseBinary primitives for this backend.
// Dispatch is a static table keyed on (op, src dtype, dst dtype); unsupported
// combinations (or too many dims) yield nullptr.
class BroadcastElementwiseBinaryFactoryImpl : public BroadcastElementwiseBinaryFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseBinaryFactoryImpl);
  BroadcastElementwiseBinaryFactoryImpl() = default;
  ~BroadcastElementwiseBinaryFactoryImpl() override = default;

  // No-attribute overload: forwards with default-constructed Scalars.
  std::unique_ptr<BroadcastElementwiseBinary> New(BinaryOp op, DataType src_type, DataType dst_type,
                                                  size_t max_num_dims) override {
    return New(op, src_type, dst_type, max_num_dims, Scalar(), Scalar());
  }

  // Single-attribute overload: forwards with a default second attribute.
  std::unique_ptr<BroadcastElementwiseBinary> New(BinaryOp op, DataType src_type, DataType dst_type,
                                                  size_t max_num_dims, Scalar attr0) override {
    return New(op, src_type, dst_type, max_num_dims, attr0, Scalar());
  }

  std::unique_ptr<BroadcastElementwiseBinary> New(BinaryOp binary_op, DataType src_type,
                                                  DataType dst_type, size_t max_num_dims,
                                                  Scalar attr0, Scalar attr1) override {
    if (max_num_dims > kMaxNumDims) { return nullptr; }
// Table-entry helpers: each expands to one map entry mapping an
// (op, src dtype, dst dtype) key to the matching templated constructor.
#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY(binary_op, data_type_pair) \
  {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(data_type_pair),                    \
                   OF_PP_PAIR_SECOND(data_type_pair)),                              \
   NewBroadcastElementwiseBinary<binary_op, OF_PP_PAIR_FIRST(data_type_pair),       \
                                 OF_PP_PAIR_FIRST(data_type_pair)>},

#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY(      \
    binary_op, src_data_type_pair, dst_data_type_pair)                            \
  {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(src_data_type_pair),              \
                   OF_PP_PAIR_SECOND(dst_data_type_pair)),                        \
   NewBroadcastElementwiseBinary<binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), \
                                 OF_PP_PAIR_FIRST(dst_data_type_pair)>},

#define MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY(binary_op, data_type_pair) \
  {std::make_tuple(binary_op, OF_PP_PAIR_SECOND(data_type_pair),                               \
                   OF_PP_PAIR_SECOND(data_type_pair)),                                         \
   NewBroadcastElementwiseBinary<binary_op, OF_PP_PAIR_FIRST(data_type_pair),                  \
                                 OF_PP_PAIR_FIRST(data_type_pair)>},

    static const std::map<
        std::tuple<BinaryOp, DataType, DataType>,
        std::function<std::unique_ptr<BroadcastElementwiseBinary>(Scalar, Scalar)>>
        new_broadcast_elementwise_binary_handle{
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY,
                                             BINARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ)
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(
                MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY,
                BINARY_COMPARISION_OP_SEQ BINARY_LOGICAL_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ,
                CUDA_PRIMITIVE_BOOL_TYPE_SEQ)
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(
                MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY,
                BINARY_ACTIVATION_BACKWARD_OP_SEQ, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)};
// Fix: the ACTIVATION_GRAD entry macro was previously never #undef'd and
// leaked past this function; all three helpers are now scoped symmetrically.
#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY
#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_AND_LOGICAL_ENTRY
#undef MAKE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY
    const auto it = new_broadcast_elementwise_binary_handle.find(
        std::make_tuple(binary_op, src_type, dst_type));
    if (it != new_broadcast_elementwise_binary_handle.end()) {
      return it->second(attr0, attr1);
    } else {
      return nullptr;
    }
  }
};
// Register the factory under DeviceType::kCUDA.
// NOTE(review): this TU includes ROCm/HIP headers yet registers under the
// kCUDA tag -- apparently the ROCm port reuses the CUDA device enum; confirm.
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, BroadcastElementwiseBinaryFactory,
BroadcastElementwiseBinaryFactoryImpl);
} // namespace
} // namespace broadcast_elementwise_binary
} // namespace primitive
} // namespace ep
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/ep/include/primitive/broadcast_elementwise_binary.h"
#include "oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ep/rocm/primitive/binary_functor.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
namespace {
// Raw storage type sized and aligned for N elements of T, so N elements can be
// moved with a single aligned (vectorizable) load/store.
template<typename T, int N>
struct GetPackType {
using type = typename std::aligned_storage<N * sizeof(T), N * sizeof(T)>::type;
};
template<typename T, int N>
using PackType = typename GetPackType<T, N>::type;
// Union view over a pack: access as one opaque storage word or as N elements.
template<typename T, int N>
union Pack {
static_assert(sizeof(PackType<T, N>) == sizeof(T) * N, "");
// Intentionally empty: leave storage uninitialized (it is always written
// before being read in the kernel below).
OF_DEVICE_FUNC Pack() {
// do nothing
}
PackType<T, N> storage;
T elem[N];
};
// Argument bundle passed by value to the kernel launch.
template<size_t max_dims, typename IndexType>
struct BroadcastElementwiseBinaryParams {
NdIndexOffsetHelper<IndexType, max_dims> src0_index_helper;
NdIndexOffsetHelper<IndexType, max_dims> src1_index_helper;
NdIndexOffsetHelper<IndexType, max_dims> dst_index_helper;
// Number of dims actually in use (<= max_dims).
size_t num_dims;
// Per-dim 0/1 masks: 0 marks a broadcast dim (source index pinned to 0),
// 1 keeps the destination index; filled in LaunchKernel from (dim == 1).
IndexType src0_index_mask[max_dims];
IndexType src1_index_mask[max_dims];
// Number of packed destination elements.
IndexType count{};
const void* src0{};
const void* src1{};
void* dst{};
Scalar attr0;
Scalar attr1;
};
// Kernel: each loop iteration handles one packed destination element -- map
// the flat offset to an nd-index, mask away broadcast dims to get both source
// offsets, load one pack per source, apply the functor lane by lane, store the
// destination pack.
template<BinaryOp binary_op, typename Src, typename Dst, size_t max_dims, size_t src0_pack_size,
size_t src1_pack_size, typename IndexType>
__global__ void BroadcastElementwiseBinaryGpu(
BroadcastElementwiseBinaryParams<max_dims, IndexType> params) {
// Each source is either fully packed (same pack size as dst) or scalar
// (pack size 1, broadcast across the pack lanes).
constexpr size_t dst_pack_size =
src0_pack_size > src1_pack_size ? src0_pack_size : src1_pack_size;
static_assert(src0_pack_size == dst_pack_size || src0_pack_size == 1, "");
static_assert(src1_pack_size == dst_pack_size || src1_pack_size == 1, "");
const PackType<Src, src0_pack_size>* src0 =
reinterpret_cast<const PackType<Src, src0_pack_size>*>(params.src0);
const PackType<Src, src1_pack_size>* src1 =
reinterpret_cast<const PackType<Src, src1_pack_size>*>(params.src1);
PackType<Dst, dst_pack_size>* dst = reinterpret_cast<PackType<Dst, dst_pack_size>*>(params.dst);
IndexType src0_index[max_dims];
IndexType src1_index[max_dims];
IndexType dst_index[max_dims];
size_t num_dims = params.num_dims;
CUDA_1D_KERNEL_LOOP_T(IndexType, offset, params.count) {
params.dst_index_helper.OffsetToNdIndex(offset, dst_index, num_dims);
// Loop over max_dims (a compile-time constant) so #pragma unroll applies;
// dims beyond num_dims are zeroed.
#pragma unroll
for (int i = 0; i < max_dims; ++i) {
if (i < num_dims) {
src0_index[i] = params.src0_index_mask[i] * dst_index[i];
src1_index[i] = params.src1_index_mask[i] * dst_index[i];
} else {
src0_index[i] = 0;
src1_index[i] = 0;
}
}
const IndexType src0_offset = params.src0_index_helper.NdIndexToOffset(src0_index, num_dims);
const IndexType src1_offset = params.src1_index_helper.NdIndexToOffset(src1_index, num_dims);
Pack<Src, src0_pack_size> src0_pack;
src0_pack.storage = src0[src0_offset];
Pack<Src, src1_pack_size> src1_pack;
src1_pack.storage = src1[src1_offset];
Pack<Dst, dst_pack_size> dst_pack;
BinaryFunctor<DeviceType::kCUDA, binary_op, Src, Dst> functor(params.attr0, params.attr1);
// Lane loop: a pack-size-1 source re-reads elem[0] for every lane.
#pragma unroll
for (int j = 0; j < dst_pack_size; ++j) {
const Src src0_val =
(src0_pack_size == dst_pack_size) ? src0_pack.elem[j] : src0_pack.elem[0];
const Src src1_val =
(src1_pack_size == dst_pack_size) ? src1_pack.elem[j] : src1_pack.elem[0];
dst_pack.elem[j] = functor(src0_val, src1_val);
}
dst[offset] = dst_pack.storage;
}
}
// Fills the kernel parameter struct (index helpers, broadcast masks, pointers,
// attrs) and launches BroadcastElementwiseBinaryGpu on the stream.
// All dims here are already divided by the pack size; count is the packed
// destination element count.
template<BinaryOp op, typename T, typename R, size_t max_dims, size_t src0_pack_size,
size_t src1_pack_size, typename IndexType>
void LaunchKernel(Stream* stream, int num_dims, const int64_t* src0_dims, const void* src0,
const int64_t* src1_dims, const void* src1, const int64_t* dst_dims, void* dst,
size_t count, Scalar attr0, Scalar attr1) {
BroadcastElementwiseBinaryParams<max_dims, IndexType> params;
// A dim of extent 1 in a source is a broadcast dim: mask 0 pins its index.
for (size_t i = 0; i < num_dims; ++i) {
params.src0_index_mask[i] = (src0_dims[i] == 1) ? 0 : 1;
params.src1_index_mask[i] = (src1_dims[i] == 1) ? 0 : 1;
}
params.src0_index_helper = NdIndexOffsetHelper<IndexType, max_dims>(src0_dims, num_dims);
params.src1_index_helper = NdIndexOffsetHelper<IndexType, max_dims>(src1_dims, num_dims);
params.dst_index_helper = NdIndexOffsetHelper<IndexType, max_dims>(dst_dims, num_dims);
params.num_dims = num_dims;
params.src0 = src0;
params.src1 = src1;
params.dst = dst;
params.count = static_cast<IndexType>(count);
params.attr0 = attr0;
params.attr1 = attr1;
auto* cuda_stream = stream->As<CudaStream>();
BroadcastElementwiseBinaryGpu<op, T, R, max_dims, src0_pack_size, src1_pack_size, IndexType>
<<<BlocksNum4ThreadsNum(params.count), kCudaThreadsNumPerBlock, 0,
cuda_stream->cuda_stream()>>>(params);
}
// Selects the kernel's index type: 32-bit when the packed element count fits,
// 64-bit otherwise (large tensors).
template<BinaryOp op, typename T, typename R, size_t max_dims, size_t src0_pack_size,
         size_t src1_pack_size>
void DispatchIndexType(Stream* stream, size_t num_dims, const int64_t* src0_dims, const void* src0,
                       const int64_t* src1_dims, const void* src1, const int64_t* dst_dims,
                       void* dst, Scalar attr0, Scalar attr1) {
  const size_t count = GetElementCount(num_dims, dst_dims);
  if (count < GetMaxVal<int32_t>()) {
    LaunchKernel<op, T, R, max_dims, src0_pack_size, src1_pack_size, int32_t>(
        stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, count, attr0, attr1);
    return;
  }
  LaunchKernel<op, T, R, max_dims, src0_pack_size, src1_pack_size, int64_t>(
      stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, count, attr0, attr1);
}
// Dispatches on the (src0, src1) pack-size combination.  Only (1,1), (4,4),
// (1,4) and (4,1) are instantiated -- the combinations GetPackSize can
// actually produce -- so anything else aborts.
template<BinaryOp op, typename T, typename R, size_t max_dims>
void DispatchPackSize(Stream* stream, size_t src0_pack_size, size_t src1_pack_size, size_t num_dims,
                      const int64_t* src0_dims, const void* src0, const int64_t* src1_dims,
                      const void* src1, const int64_t* dst_dims, void* dst, Scalar attr0,
                      Scalar attr1) {
  if (src0_pack_size == 1 && src1_pack_size == 1) {
    DispatchIndexType<op, T, R, max_dims, 1, 1>(stream, num_dims, src0_dims, src0, src1_dims, src1,
                                                dst_dims, dst, attr0, attr1);
  } else if (src0_pack_size == 4 && src1_pack_size == 4) {
    DispatchIndexType<op, T, R, max_dims, 4, 4>(stream, num_dims, src0_dims, src0, src1_dims, src1,
                                                dst_dims, dst, attr0, attr1);
  } else if (src0_pack_size == 1 && src1_pack_size == 4) {
    DispatchIndexType<op, T, R, max_dims, 1, 4>(stream, num_dims, src0_dims, src0, src1_dims, src1,
                                                dst_dims, dst, attr0, attr1);
  } else if (src0_pack_size == 4 && src1_pack_size == 1) {
    DispatchIndexType<op, T, R, max_dims, 4, 1>(stream, num_dims, src0_dims, src0, src1_dims, src1,
                                                dst_dims, dst, attr0, attr1);
  } else {
    UNIMPLEMENTED();
  }
}
// Dispatches on the simplified dim count, instantiating the kernel with the
// smallest max_dims bucket (2, 3, 4 or 8) that fits; num_dims == 1 is ruled
// out up front and num_dims > 8 aborts.
template<BinaryOp op, typename T, typename R>
void DispatchNumDims(Stream* stream, size_t src0_pack_size, size_t src1_pack_size, size_t num_dims,
                     const int64_t* src0_dims, const void* src0, const int64_t* src1_dims,
                     const void* src1, const int64_t* dst_dims, void* dst, Scalar attr0,
                     Scalar attr1) {
  CHECK_NE(num_dims, 1);
  if (num_dims == 2) {
    DispatchPackSize<op, T, R, 2>(stream, src0_pack_size, src1_pack_size, num_dims, src0_dims, src0,
                                  src1_dims, src1, dst_dims, dst, attr0, attr1);
  } else if (num_dims == 3) {
    DispatchPackSize<op, T, R, 3>(stream, src0_pack_size, src1_pack_size, num_dims, src0_dims, src0,
                                  src1_dims, src1, dst_dims, dst, attr0, attr1);
  } else if (num_dims == 4) {
    DispatchPackSize<op, T, R, 4>(stream, src0_pack_size, src1_pack_size, num_dims, src0_dims, src0,
                                  src1_dims, src1, dst_dims, dst, attr0, attr1);
  } else if (num_dims <= 8) {
    DispatchPackSize<op, T, R, 8>(stream, src0_pack_size, src1_pack_size, num_dims, src0_dims, src0,
                                  src1_dims, src1, dst_dims, dst, attr0, attr1);
  } else {
    UNIMPLEMENTED();
  }
}
// Picks the largest power-of-two pack size (<= max_pack_size) for which both
// sources and the destination pointer are suitably aligned.  A source whose
// innermost dim is 1 is broadcast and does not constrain the pack size.
// NOTE(review): the loop condition is `pack_size > 2`, so with kMaxPackSize=4
// only pack size 4 is tried before falling back to 1.  This looks deliberate
// -- DispatchPackSize has no pack-size-2 instantiations -- but confirm that
// skipping pack size 2 is intended rather than an off-by-one.
template<size_t max_pack_size, typename T, typename R>
size_t GetPackSize(size_t num_src_dims, const int64_t* src0_dims, const void* src0,
const int64_t* src1_dims, const void* src1, void* dst) {
static_assert(max_pack_size > 0 && (max_pack_size & (max_pack_size - 1)) == 0, "");
// At least one source must be non-broadcast in the innermost dim.
CHECK(src0_dims[num_src_dims - 1] != 1 || src1_dims[num_src_dims - 1] != 1);
auto dst_ptr = reinterpret_cast<std::uintptr_t>(dst);
for (size_t pack_size = max_pack_size; pack_size > 2; pack_size /= 2) {
bool is_src0_supported = (src0_dims[num_src_dims - 1] == 1)
|| IsPackSizeSupported<T>(pack_size, num_src_dims, src0_dims, src0);
bool is_src1_supported = (src1_dims[num_src_dims - 1] == 1)
|| IsPackSizeSupported<T>(pack_size, num_src_dims, src1_dims, src1);
if (is_src0_supported && is_src1_supported && (dst_ptr % (pack_size * sizeof(R))) == 0) {
return pack_size;
}
}
return 1;
}
// Upper bound on the vectorization pack size considered by GetPackSize.
constexpr size_t kMaxPackSize = 4;
// Entry point after dim simplification: choose a pack size, fold it into the
// innermost non-broadcast dims, then dispatch on the dim count.
// NOTE: mutates the simplified_* dim arrays in place (innermost dim is divided
// by the chosen pack size).
template<BinaryOp op, typename T, typename R>
void LaunchWithSimplified(Stream* stream, size_t simplified_num_dims, int64_t* simplified_src0_dims,
const void* src0, int64_t* simplified_src1_dims, const void* src1,
int64_t* simplified_dst_dims, void* dst, Scalar attr0, Scalar attr1) {
CHECK_LE(simplified_num_dims, kMaxNumDims);
size_t pack_size = GetPackSize<kMaxPackSize, T, R>(simplified_num_dims, simplified_src0_dims,
src0, simplified_src1_dims, src1, dst);
size_t src0_pack_size = 1;
size_t src1_pack_size = 1;
// An innermost dim of 1 means the source is broadcast there and stays at
// pack size 1; otherwise its innermost dim absorbs the pack size.
if (simplified_src0_dims[simplified_num_dims - 1] != 1) {
simplified_src0_dims[simplified_num_dims - 1] /= pack_size;
src0_pack_size = pack_size;
}
if (simplified_src1_dims[simplified_num_dims - 1] != 1) {
simplified_src1_dims[simplified_num_dims - 1] /= pack_size;
src1_pack_size = pack_size;
}
simplified_dst_dims[simplified_num_dims - 1] /= pack_size;
DispatchNumDims<op, T, R>(stream, src0_pack_size, src1_pack_size, simplified_num_dims,
simplified_src0_dims, src0, simplified_src1_dims, src1,
simplified_dst_dims, dst, attr0, attr1);
}
// Unary adapter that fixes the left operand: dst = op(scalar, src).
template<BinaryOp binary_op, typename Src, typename Dst>
struct BinaryLhsScalarFunctor {
  __host__ __device__ BinaryLhsScalarFunctor(Src scalar, Scalar attr0, Scalar attr1)
      : scalar(scalar), functor(attr0, attr1) {}
  __device__ Dst operator()(Src src) const { return functor(scalar, src); }
  const Src scalar;
  BinaryFunctor<DeviceType::kCUDA, binary_op, Src, Dst> functor;
};
// Unary adapter that fixes the right operand: dst = op(src, scalar).
template<BinaryOp binary_op, typename Src, typename Dst>
struct BinaryRhsScalarFunctor {
  __host__ __device__ BinaryRhsScalarFunctor(Src scalar, Scalar attr0, Scalar attr1)
      : scalar(scalar), functor(attr0, attr1) {}
  __device__ Dst operator()(Src src) const { return functor(src, scalar); }
  const Src scalar;
  BinaryFunctor<DeviceType::kCUDA, binary_op, Src, Dst> functor;
};
// Factory that defers reading the lhs scalar until kernel time: operator() is
// device code and dereferences scalar_ptr there, so the pointed-to scalar may
// live in device memory.
template<BinaryOp binary_op, typename Src, typename Dst>
struct BinaryLhsScalarPtrFunctorFactory {
  __host__ __device__ BinaryLhsScalarPtrFunctorFactory(const Src* scalar_ptr, Scalar attr0,
                                                       Scalar attr1)
      : scalar_ptr(scalar_ptr), attr0(attr0), attr1(attr1) {}
  __device__ BinaryLhsScalarFunctor<binary_op, Src, Dst> operator()() const {
    return BinaryLhsScalarFunctor<binary_op, Src, Dst>(*scalar_ptr, attr0, attr1);
  }
  const Src* scalar_ptr;
  Scalar attr0, attr1;
};
// Rhs counterpart of BinaryLhsScalarPtrFunctorFactory: the scalar pointer is
// dereferenced in device code when the functor is materialized.
template<BinaryOp binary_op, typename Src, typename Dst>
struct BinaryRhsScalarPtrFunctorFactory {
  __host__ __device__ explicit BinaryRhsScalarPtrFunctorFactory(const Src* scalar_ptr, Scalar attr0,
                                                                Scalar attr1)
      : scalar_ptr(scalar_ptr), attr0(attr0), attr1(attr1) {}
  __device__ BinaryRhsScalarFunctor<binary_op, Src, Dst> operator()() const {
    return BinaryRhsScalarFunctor<binary_op, Src, Dst>(*scalar_ptr, attr0, attr1);
  }
  const Src* scalar_ptr;
  Scalar attr0, attr1;
};
// Entry point for the tensor-tensor case. Simplifies/merges broadcast dims,
// checks in-place safety, then picks the cheapest launch path:
//  - identical simplified shapes: plain elementwise Binary kernel;
//  - one operand simplified to a single element: Unary kernel with a functor
//    factory that reads the scalar through its (device-accessible) pointer;
//  - otherwise: the packed N-d broadcast kernel.
template<BinaryOp binary_op, typename Src, typename Dst>
void DispatchLaunch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const Src* src0,
                    size_t num_src1_dims, const int64_t* src1_dims, const Src* src1, Dst* dst,
                    Scalar attr0, Scalar attr1) {
  auto* cuda_stream = stream->As<CudaStream>();
  size_t simplified_num_dims = 0;
  int64_t simplified_src0_dims[kMaxNumDims];
  int64_t simplified_src1_dims[kMaxNumDims];
  int64_t simplified_dst_dims[kMaxNumDims];
  SimplifyBroadcastDims<kMaxNumDims>(num_src0_dims, src0_dims, num_src1_dims, src1_dims,
                                     &simplified_num_dims, simplified_src0_dims,
                                     simplified_src1_dims, simplified_dst_dims);
  CheckInplace(simplified_num_dims, simplified_src0_dims, src0, simplified_src1_dims, src1,
               simplified_dst_dims, dst);
  if (IsDimsEquals(simplified_num_dims, simplified_src0_dims, simplified_num_dims,
                   simplified_src1_dims)) {
    // No broadcasting left after simplification: elementwise fast path.
    const int64_t elem_cnt = GetElementCount(simplified_num_dims, simplified_src0_dims);
    OF_CUDA_CHECK((cuda::elementwise::Binary(
        BinaryFunctor<DeviceType::kCUDA, binary_op, Src, Dst>(attr0, attr1), elem_cnt, dst, src0,
        src1, cuda_stream->cuda_stream())));
  } else {
    if (simplified_num_dims == 1 && simplified_src0_dims[0] == 1) {
      // src0 is a single element broadcast against src1.
      OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory(
          BinaryLhsScalarPtrFunctorFactory<binary_op, Src, Dst>(src0, attr0, attr1),
          simplified_src1_dims[0], dst, src1, cuda_stream->cuda_stream())));
    } else if (simplified_num_dims == 1 && simplified_src1_dims[0] == 1) {
      // src1 is a single element broadcast against src0.
      OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory(
          BinaryRhsScalarPtrFunctorFactory<binary_op, Src, Dst>(src1, attr0, attr1),
          simplified_src0_dims[0], dst, src0, cuda_stream->cuda_stream())));
    } else {
      LaunchWithSimplified<binary_op, Src, Dst>(stream, simplified_num_dims, simplified_src0_dims,
                                                src0, simplified_src1_dims, src1,
                                                simplified_dst_dims, dst, attr0, attr1);
    }
  }
}
// Extracts a typed value from a Scalar attribute wrapper.
template<typename T>
T GetValue(Scalar value) {
  return value.Value<T>();
}
// Specialization for half: read the value as float, then narrow to half.
template<>
half GetValue<half>(Scalar value) {
  return static_cast<half>(GetValue<float>(value));
}
// #if CUDA_VERSION >= 11000
// template<>
// nv_bfloat16 GetValue<nv_bfloat16>(Scalar value) {
// return static_cast<nv_bfloat16>(GetValue<float>(value));
// }
// #endif // CUDA_VERSION >= 11000
// Type-erased BroadcastElementwiseBinary primitive. The three Launch overloads
// cover (scalar op tensor), (tensor op scalar) and (tensor op tensor). attr0
// and attr1 are op-specific attributes forwarded to the BinaryFunctor.
template<BinaryOp binary_op, typename Src, typename Dst>
class BroadcastElementwiseBinaryImpl : public BroadcastElementwiseBinary {
 public:
  OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseBinaryImpl);
  BroadcastElementwiseBinaryImpl(Scalar attr0, Scalar attr1) : attr0(attr0), attr1(attr1) {}
  ~BroadcastElementwiseBinaryImpl() override = default;
  // Scalar (host value) src0 op tensor src1 -> dst, as an elementwise kernel.
  void Launch(Stream* stream, Scalar src0, size_t num_src1_dims, const int64_t* src1_dims,
              const void* src1, void* dst) override {
    auto* cuda_stream = stream->As<CudaStream>();
    const size_t elem_cnt = GetElementCount(num_src1_dims, src1_dims);
    OF_CUDA_CHECK((cuda::elementwise::Unary(
        BinaryLhsScalarFunctor<binary_op, Src, Dst>(GetValue<Src>(src0), attr0, attr1), elem_cnt,
        reinterpret_cast<Dst*>(dst), reinterpret_cast<const Src*>(src1),
        cuda_stream->cuda_stream())));
  }
  // Tensor src0 op scalar (host value) src1 -> dst, as an elementwise kernel.
  void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0,
              Scalar src1, void* dst) override {
    auto* cuda_stream = stream->As<CudaStream>();
    const size_t elem_cnt = GetElementCount(num_src0_dims, src0_dims);
    OF_CUDA_CHECK((cuda::elementwise::Unary(
        BinaryRhsScalarFunctor<binary_op, Src, Dst>(GetValue<Src>(src1), attr0, attr1), elem_cnt,
        reinterpret_cast<Dst*>(dst), reinterpret_cast<const Src*>(src0),
        cuda_stream->cuda_stream())));
  }
  // Tensor op tensor with full broadcast semantics.
  void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0,
              size_t num_src1_dims, const int64_t* src1_dims, const void* src1,
              void* dst) override {
    DispatchLaunch<binary_op, Src, Dst>(
        stream, num_src0_dims, src0_dims, reinterpret_cast<const Src*>(src0), num_src1_dims,
        src1_dims, reinterpret_cast<const Src*>(src1), reinterpret_cast<Dst*>(dst), attr0, attr1);
  }
 private:
  Scalar attr0, attr1;
};
} // namespace
// Builds a type-erased broadcast binary primitive for the given op and
// source/destination element types; attr0/attr1 are op-specific attributes.
template<BinaryOp binary_op, typename Src, typename Dst>
std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary(Scalar attr0,
                                                                          Scalar attr1) {
  return std::make_unique<BroadcastElementwiseBinaryImpl<binary_op, Src, Dst>>(attr0, attr1);
}
} // namespace broadcast_elementwise_binary
} // namespace primitive
} // namespace ep
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/ep/include/primitive/broadcast_elementwise_binary.h"
#include "oneflow/core/ep/common/primitive/broadcast_elementwise_binary.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ep/rocm/primitive/binary_functor.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
namespace {
// Trait mapping (T, N) to raw storage sized and aligned for N contiguous
// elements of T, so a whole pack can be moved as one load/store.
template<typename T, int N>
struct GetPackType {
  using type = typename std::aligned_storage<sizeof(T) * N, sizeof(T) * N>::type;
};

// Shorthand for the packed-storage type of N elements of T.
template<typename T, int N>
using PackType = typename GetPackType<T, N>::type;
// Union view of a pack: `storage` is moved to/from memory as one aligned unit
// while `elem` exposes the N individual lanes for computation.
template<typename T, int N>
union Pack {
  static_assert(sizeof(PackType<T, N>) == sizeof(T) * N, "");
  OF_DEVICE_FUNC Pack() {
    // do nothing -- storage is intentionally left uninitialized.
  }
  PackType<T, N> storage;
  T elem[N];
};
// Parameter block passed by value to the broadcast kernel.
template<size_t max_dims, typename IndexType>
struct BroadcastElementwiseBinaryParams {
  NdIndexOffsetHelper<IndexType, max_dims> src0_index_helper;
  NdIndexOffsetHelper<IndexType, max_dims> src1_index_helper;
  NdIndexOffsetHelper<IndexType, max_dims> dst_index_helper;
  // Actual rank in use; only the first num_dims entries of the arrays matter.
  size_t num_dims;
  // 0 where the source broadcasts along that dim (index pinned to 0), else 1.
  IndexType src0_index_mask[max_dims];
  IndexType src1_index_mask[max_dims];
  // Number of packed dst elements to produce.
  IndexType count{};
  const void* src0{};
  const void* src1{};
  void* dst{};
  Scalar attr0;
  Scalar attr1;
};
// Generic packed N-d broadcast binary kernel. For each packed dst offset the
// thread decodes the dst nd-index, multiplies it by the per-dim masks to pin
// broadcast dims to 0, re-encodes per-source offsets, loads one pack from each
// source, applies the functor lane by lane, and stores one dst pack.
template<BinaryOp binary_op, typename Src, typename Dst, size_t max_dims, size_t src0_pack_size,
         size_t src1_pack_size, typename IndexType>
__global__ void BroadcastElementwiseBinaryGpu(
    BroadcastElementwiseBinaryParams<max_dims, IndexType> params) {
  // dst pack width is the wider of the two source widths; each source is
  // either packed to that same width or scalar (enforced below).
  constexpr size_t dst_pack_size =
      src0_pack_size > src1_pack_size ? src0_pack_size : src1_pack_size;
  static_assert(src0_pack_size == dst_pack_size || src0_pack_size == 1, "");
  static_assert(src1_pack_size == dst_pack_size || src1_pack_size == 1, "");
  const PackType<Src, src0_pack_size>* src0 =
      reinterpret_cast<const PackType<Src, src0_pack_size>*>(params.src0);
  const PackType<Src, src1_pack_size>* src1 =
      reinterpret_cast<const PackType<Src, src1_pack_size>*>(params.src1);
  PackType<Dst, dst_pack_size>* dst = reinterpret_cast<PackType<Dst, dst_pack_size>*>(params.dst);
  IndexType src0_index[max_dims];
  IndexType src1_index[max_dims];
  IndexType dst_index[max_dims];
  size_t num_dims = params.num_dims;
  CUDA_1D_KERNEL_LOOP_T(IndexType, offset, params.count) {
    params.dst_index_helper.OffsetToNdIndex(offset, dst_index, num_dims);
#pragma unroll
    for (int i = 0; i < max_dims; ++i) {
      if (i < num_dims) {
        // Mask is 0 on broadcast dims, collapsing the source index to 0.
        src0_index[i] = params.src0_index_mask[i] * dst_index[i];
        src1_index[i] = params.src1_index_mask[i] * dst_index[i];
      } else {
        src0_index[i] = 0;
        src1_index[i] = 0;
      }
    }
    const IndexType src0_offset = params.src0_index_helper.NdIndexToOffset(src0_index, num_dims);
    const IndexType src1_offset = params.src1_index_helper.NdIndexToOffset(src1_index, num_dims);
    Pack<Src, src0_pack_size> src0_pack;
    src0_pack.storage = src0[src0_offset];
    Pack<Src, src1_pack_size> src1_pack;
    src1_pack.storage = src1[src1_offset];
    Pack<Dst, dst_pack_size> dst_pack;
    BinaryFunctor<DeviceType::kCUDA, binary_op, Src, Dst> functor(params.attr0, params.attr1);
#pragma unroll
    for (int j = 0; j < dst_pack_size; ++j) {
      // A scalar (pack size 1) source contributes elem[0] to every lane.
      const Src src0_val =
          (src0_pack_size == dst_pack_size) ? src0_pack.elem[j] : src0_pack.elem[0];
      const Src src1_val =
          (src1_pack_size == dst_pack_size) ? src1_pack.elem[j] : src1_pack.elem[0];
      dst_pack.elem[j] = functor(src0_val, src1_val);
    }
    dst[offset] = dst_pack.storage;
  }
}
template<BinaryOp op, typename T, typename R, size_t max_dims, size_t src0_pack_size,
size_t src1_pack_size, typename IndexType>
void LaunchKernel(Stream* stream, int num_dims, const int64_t* src0_dims, const void* src0,
const int64_t* src1_dims, const void* src1, const int64_t* dst_dims, void* dst,
size_t count, Scalar attr0, Scalar attr1) {
BroadcastElementwiseBinaryParams<max_dims, IndexType> params;
for (size_t i = 0; i < num_dims; ++i) {
params.src0_index_mask[i] = (src0_dims[i] == 1) ? 0 : 1;
params.src1_index_mask[i] = (src1_dims[i] == 1) ? 0 : 1;
}
params.src0_index_helper = NdIndexOffsetHelper<IndexType, max_dims>(src0_dims, num_dims);
params.src1_index_helper = NdIndexOffsetHelper<IndexType, max_dims>(src1_dims, num_dims);
params.dst_index_helper = NdIndexOffsetHelper<IndexType, max_dims>(dst_dims, num_dims);
params.num_dims = num_dims;
params.src0 = src0;
params.src1 = src1;
params.dst = dst;
params.count = static_cast<IndexType>(count);
params.attr0 = attr0;
params.attr1 = attr1;
auto* cuda_stream = stream->As<CudaStream>();
BroadcastElementwiseBinaryGpu<op, T, R, max_dims, src0_pack_size, src1_pack_size, IndexType>
<<<BlocksNum4ThreadsNum(params.count), kCudaThreadsNumPerBlock, 0,
cuda_stream->cuda_stream()>>>(params);
}
// Chooses the index arithmetic width for the kernel: 32-bit indices whenever
// the dst element count fits in int32, 64-bit otherwise.
template<BinaryOp op, typename T, typename R, size_t max_dims, size_t src0_pack_size,
         size_t src1_pack_size>
void DispatchIndexType(Stream* stream, size_t num_dims, const int64_t* src0_dims, const void* src0,
                       const int64_t* src1_dims, const void* src1, const int64_t* dst_dims,
                       void* dst, Scalar attr0, Scalar attr1) {
  size_t count = GetElementCount(num_dims, dst_dims);
  if (count < GetMaxVal<int32_t>()) {
    LaunchKernel<op, T, R, max_dims, src0_pack_size, src1_pack_size, int32_t>(
        stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, count, attr0, attr1);
  } else {
    LaunchKernel<op, T, R, max_dims, src0_pack_size, src1_pack_size, int64_t>(
        stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, count, attr0, attr1);
  }
}
// Selects the kernel instantiation matching the per-source pack widths. Only
// the (1,1), (4,4), (1,4) and (4,1) combinations are instantiated, consistent
// with GetPackSize, which returns only 1 or 4.
template<BinaryOp op, typename T, typename R, size_t max_dims>
void DispatchPackSize(Stream* stream, size_t src0_pack_size, size_t src1_pack_size, size_t num_dims,
                      const int64_t* src0_dims, const void* src0, const int64_t* src1_dims,
                      const void* src1, const int64_t* dst_dims, void* dst, Scalar attr0,
                      Scalar attr1) {
  void (*func)(Stream* /*stream*/, size_t /*num_dims*/, const int64_t* /*src0_dims*/,
               const void* /*src0*/, const int64_t* /*src1_dims*/, const void* /*src1*/,
               const int64_t* /*dst_dims*/, void* /*dst*/, Scalar /*attr0*/, Scalar /*attr1*/) =
      nullptr;
  if (src0_pack_size == 1 && src1_pack_size == 1) {
    func = DispatchIndexType<op, T, R, max_dims, 1, 1>;
  } else if (src0_pack_size == 4 && src1_pack_size == 4) {
    func = DispatchIndexType<op, T, R, max_dims, 4, 4>;
  } else if (src0_pack_size == 1 && src1_pack_size == 4) {
    func = DispatchIndexType<op, T, R, max_dims, 1, 4>;
  } else if (src0_pack_size == 4 && src1_pack_size == 1) {
    func = DispatchIndexType<op, T, R, max_dims, 4, 1>;
  } else {
    UNIMPLEMENTED();
  }
  func(stream, num_dims, src0_dims, src0, src1_dims, src1, dst_dims, dst, attr0, attr1);
}
// Rounds the simplified rank up to one of the instantiated max_dims values
// (2, 3, 4 or 8) and forwards. Rank 1 is rejected here: DispatchLaunch
// already routes rank-1 cases through the elementwise fast paths.
template<BinaryOp op, typename T, typename R>
void DispatchNumDims(Stream* stream, size_t src0_pack_size, size_t src1_pack_size, size_t num_dims,
                     const int64_t* src0_dims, const void* src0, const int64_t* src1_dims,
                     const void* src1, const int64_t* dst_dims, void* dst, Scalar attr0,
                     Scalar attr1) {
  void (*func)(Stream* /*stream*/, size_t /*src0_pack_size*/, size_t /*src1_pack_size*/,
               size_t /*num_dims*/, const int64_t* /*src0_dims*/, const void* /*src0*/,
               const int64_t* /*src1_dims*/, const void* /*src1*/, const int64_t* /*dst_dims*/,
               void* /*dst*/, Scalar /*attr0*/, Scalar /*attr1*/) = nullptr;
  CHECK_NE(num_dims, 1);
  if (num_dims == 2) {
    func = DispatchPackSize<op, T, R, 2>;
  } else if (num_dims == 3) {
    func = DispatchPackSize<op, T, R, 3>;
  } else if (num_dims == 4) {
    func = DispatchPackSize<op, T, R, 4>;
  } else if (num_dims <= 8) {
    func = DispatchPackSize<op, T, R, 8>;
  } else {
    UNIMPLEMENTED();
  }
  func(stream, src0_pack_size, src1_pack_size, num_dims, src0_dims, src0, src1_dims, src1, dst_dims,
       dst, attr0, attr1);
}
// Picks the widest vectorized pack width (elements per load/store) usable for
// this launch. A source whose innermost dim is 1 is broadcast along that dim
// and does not constrain the width; otherwise IsPackSizeSupported must accept
// it. The destination pointer must be aligned to pack_size * sizeof(R).
// NOTE(review): the loop condition is `pack_size > 2`, so with max_pack_size
// == 4 only width 4 is tried before falling back to 1. This is consistent
// with DispatchPackSize above, which only instantiates widths 1 and 4.
template<size_t max_pack_size, typename T, typename R>
size_t GetPackSize(size_t num_src_dims, const int64_t* src0_dims, const void* src0,
                   const int64_t* src1_dims, const void* src1, void* dst) {
  static_assert(max_pack_size > 0 && (max_pack_size & (max_pack_size - 1)) == 0, "");
  // At least one source must be non-broadcast in the innermost dimension.
  CHECK(src0_dims[num_src_dims - 1] != 1 || src1_dims[num_src_dims - 1] != 1);
  auto dst_ptr = reinterpret_cast<std::uintptr_t>(dst);
  for (size_t pack_size = max_pack_size; pack_size > 2; pack_size /= 2) {
    bool is_src0_supported = (src0_dims[num_src_dims - 1] == 1)
                             || IsPackSizeSupported<T>(pack_size, num_src_dims, src0_dims, src0);
    bool is_src1_supported = (src1_dims[num_src_dims - 1] == 1)
                             || IsPackSizeSupported<T>(pack_size, num_src_dims, src1_dims, src1);
    if (is_src0_supported && is_src1_supported && (dst_ptr % (pack_size * sizeof(R))) == 0) {
      return pack_size;
    }
  }
  return 1;
}
// Upper bound on the vectorized pack width tried by GetPackSize.
constexpr size_t kMaxPackSize = 4;
// Launches the generic N-d broadcast kernel on pre-simplified dims. Picks a
// vector pack width, then rescales the innermost (contiguous) dim of each
// packed operand and of the destination from elements to packs. A source that
// is broadcast along the innermost dim keeps pack size 1 so the kernel
// replicates its single value across the output pack.
// NOTE: mutates the simplified_*_dims arrays in place.
template<BinaryOp op, typename T, typename R>
void LaunchWithSimplified(Stream* stream, size_t simplified_num_dims, int64_t* simplified_src0_dims,
                          const void* src0, int64_t* simplified_src1_dims, const void* src1,
                          int64_t* simplified_dst_dims, void* dst, Scalar attr0, Scalar attr1) {
  CHECK_LE(simplified_num_dims, kMaxNumDims);
  size_t pack_size = GetPackSize<kMaxPackSize, T, R>(simplified_num_dims, simplified_src0_dims,
                                                     src0, simplified_src1_dims, src1, dst);
  size_t src0_pack_size = 1;
  size_t src1_pack_size = 1;
  if (simplified_src0_dims[simplified_num_dims - 1] != 1) {
    simplified_src0_dims[simplified_num_dims - 1] /= pack_size;
    src0_pack_size = pack_size;
  }
  if (simplified_src1_dims[simplified_num_dims - 1] != 1) {
    simplified_src1_dims[simplified_num_dims - 1] /= pack_size;
    src1_pack_size = pack_size;
  }
  simplified_dst_dims[simplified_num_dims - 1] /= pack_size;
  DispatchNumDims<op, T, R>(stream, src0_pack_size, src1_pack_size, simplified_num_dims,
                            simplified_src0_dims, src0, simplified_src1_dims, src1,
                            simplified_dst_dims, dst, attr0, attr1);
}
// Unary adapter that fixes the left operand: dst = op(scalar, src).
template<BinaryOp binary_op, typename Src, typename Dst>
struct BinaryLhsScalarFunctor {
  __host__ __device__ BinaryLhsScalarFunctor(Src scalar, Scalar attr0, Scalar attr1)
      : scalar(scalar), functor(attr0, attr1) {}
  __device__ Dst operator()(Src src) const { return functor(scalar, src); }
  const Src scalar;
  BinaryFunctor<DeviceType::kCUDA, binary_op, Src, Dst> functor;
};
// Unary adapter that fixes the right operand: dst = op(src, scalar).
template<BinaryOp binary_op, typename Src, typename Dst>
struct BinaryRhsScalarFunctor {
  __host__ __device__ BinaryRhsScalarFunctor(Src scalar, Scalar attr0, Scalar attr1)
      : scalar(scalar), functor(attr0, attr1) {}
  __device__ Dst operator()(Src src) const { return functor(src, scalar); }
  const Src scalar;
  BinaryFunctor<DeviceType::kCUDA, binary_op, Src, Dst> functor;
};
// Factory that defers reading the lhs scalar until kernel time: operator() is
// device code and dereferences scalar_ptr there, so the pointed-to scalar may
// live in device memory.
template<BinaryOp binary_op, typename Src, typename Dst>
struct BinaryLhsScalarPtrFunctorFactory {
  __host__ __device__ BinaryLhsScalarPtrFunctorFactory(const Src* scalar_ptr, Scalar attr0,
                                                       Scalar attr1)
      : scalar_ptr(scalar_ptr), attr0(attr0), attr1(attr1) {}
  __device__ BinaryLhsScalarFunctor<binary_op, Src, Dst> operator()() const {
    return BinaryLhsScalarFunctor<binary_op, Src, Dst>(*scalar_ptr, attr0, attr1);
  }
  const Src* scalar_ptr;
  Scalar attr0, attr1;
};
// Rhs counterpart of BinaryLhsScalarPtrFunctorFactory: the scalar pointer is
// dereferenced in device code when the functor is materialized.
template<BinaryOp binary_op, typename Src, typename Dst>
struct BinaryRhsScalarPtrFunctorFactory {
  __host__ __device__ explicit BinaryRhsScalarPtrFunctorFactory(const Src* scalar_ptr, Scalar attr0,
                                                                Scalar attr1)
      : scalar_ptr(scalar_ptr), attr0(attr0), attr1(attr1) {}
  __device__ BinaryRhsScalarFunctor<binary_op, Src, Dst> operator()() const {
    return BinaryRhsScalarFunctor<binary_op, Src, Dst>(*scalar_ptr, attr0, attr1);
  }
  const Src* scalar_ptr;
  Scalar attr0, attr1;
};
// Entry point for the tensor-tensor case. Simplifies/merges broadcast dims,
// checks in-place safety, then picks the cheapest launch path:
//  - identical simplified shapes: plain elementwise Binary kernel;
//  - one operand simplified to a single element: Unary kernel with a functor
//    factory that reads the scalar through its (device-accessible) pointer;
//  - otherwise: the packed N-d broadcast kernel.
template<BinaryOp binary_op, typename Src, typename Dst>
void DispatchLaunch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const Src* src0,
                    size_t num_src1_dims, const int64_t* src1_dims, const Src* src1, Dst* dst,
                    Scalar attr0, Scalar attr1) {
  auto* cuda_stream = stream->As<CudaStream>();
  size_t simplified_num_dims = 0;
  int64_t simplified_src0_dims[kMaxNumDims];
  int64_t simplified_src1_dims[kMaxNumDims];
  int64_t simplified_dst_dims[kMaxNumDims];
  SimplifyBroadcastDims<kMaxNumDims>(num_src0_dims, src0_dims, num_src1_dims, src1_dims,
                                     &simplified_num_dims, simplified_src0_dims,
                                     simplified_src1_dims, simplified_dst_dims);
  CheckInplace(simplified_num_dims, simplified_src0_dims, src0, simplified_src1_dims, src1,
               simplified_dst_dims, dst);
  if (IsDimsEquals(simplified_num_dims, simplified_src0_dims, simplified_num_dims,
                   simplified_src1_dims)) {
    // No broadcasting left after simplification: elementwise fast path.
    const int64_t elem_cnt = GetElementCount(simplified_num_dims, simplified_src0_dims);
    OF_CUDA_CHECK((cuda::elementwise::Binary(
        BinaryFunctor<DeviceType::kCUDA, binary_op, Src, Dst>(attr0, attr1), elem_cnt, dst, src0,
        src1, cuda_stream->cuda_stream())));
  } else {
    if (simplified_num_dims == 1 && simplified_src0_dims[0] == 1) {
      // src0 is a single element broadcast against src1.
      OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory(
          BinaryLhsScalarPtrFunctorFactory<binary_op, Src, Dst>(src0, attr0, attr1),
          simplified_src1_dims[0], dst, src1, cuda_stream->cuda_stream())));
    } else if (simplified_num_dims == 1 && simplified_src1_dims[0] == 1) {
      // src1 is a single element broadcast against src0.
      OF_CUDA_CHECK((cuda::elementwise::UnaryWithFactory(
          BinaryRhsScalarPtrFunctorFactory<binary_op, Src, Dst>(src1, attr0, attr1),
          simplified_src0_dims[0], dst, src0, cuda_stream->cuda_stream())));
    } else {
      LaunchWithSimplified<binary_op, Src, Dst>(stream, simplified_num_dims, simplified_src0_dims,
                                                src0, simplified_src1_dims, src1,
                                                simplified_dst_dims, dst, attr0, attr1);
    }
  }
}
// Extracts a typed value from a Scalar attribute wrapper.
template<typename T>
T GetValue(Scalar value) {
  return value.Value<T>();
}
// Specialization for half: read the value as float, then narrow to half.
template<>
half GetValue<half>(Scalar value) {
  return static_cast<half>(GetValue<float>(value));
}
// #if CUDA_VERSION >= 11000
// template<>
// nv_bfloat16 GetValue<nv_bfloat16>(Scalar value) {
// return static_cast<nv_bfloat16>(GetValue<float>(value));
// }
// #endif // CUDA_VERSION >= 11000
// Type-erased BroadcastElementwiseBinary primitive. The three Launch overloads
// cover (scalar op tensor), (tensor op scalar) and (tensor op tensor). attr0
// and attr1 are op-specific attributes forwarded to the BinaryFunctor.
template<BinaryOp binary_op, typename Src, typename Dst>
class BroadcastElementwiseBinaryImpl : public BroadcastElementwiseBinary {
 public:
  OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseBinaryImpl);
  BroadcastElementwiseBinaryImpl(Scalar attr0, Scalar attr1) : attr0(attr0), attr1(attr1) {}
  ~BroadcastElementwiseBinaryImpl() override = default;
  // Scalar (host value) src0 op tensor src1 -> dst, as an elementwise kernel.
  void Launch(Stream* stream, Scalar src0, size_t num_src1_dims, const int64_t* src1_dims,
              const void* src1, void* dst) override {
    auto* cuda_stream = stream->As<CudaStream>();
    const size_t elem_cnt = GetElementCount(num_src1_dims, src1_dims);
    OF_CUDA_CHECK((cuda::elementwise::Unary(
        BinaryLhsScalarFunctor<binary_op, Src, Dst>(GetValue<Src>(src0), attr0, attr1), elem_cnt,
        reinterpret_cast<Dst*>(dst), reinterpret_cast<const Src*>(src1),
        cuda_stream->cuda_stream())));
  }
  // Tensor src0 op scalar (host value) src1 -> dst, as an elementwise kernel.
  void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0,
              Scalar src1, void* dst) override {
    auto* cuda_stream = stream->As<CudaStream>();
    const size_t elem_cnt = GetElementCount(num_src0_dims, src0_dims);
    OF_CUDA_CHECK((cuda::elementwise::Unary(
        BinaryRhsScalarFunctor<binary_op, Src, Dst>(GetValue<Src>(src1), attr0, attr1), elem_cnt,
        reinterpret_cast<Dst*>(dst), reinterpret_cast<const Src*>(src0),
        cuda_stream->cuda_stream())));
  }
  // Tensor op tensor with full broadcast semantics.
  void Launch(Stream* stream, size_t num_src0_dims, const int64_t* src0_dims, const void* src0,
              size_t num_src1_dims, const int64_t* src1_dims, const void* src1,
              void* dst) override {
    DispatchLaunch<binary_op, Src, Dst>(
        stream, num_src0_dims, src0_dims, reinterpret_cast<const Src*>(src0), num_src1_dims,
        src1_dims, reinterpret_cast<const Src*>(src1), reinterpret_cast<Dst*>(dst), attr0, attr1);
  }
 private:
  Scalar attr0, attr1;
};
} // namespace
// Builds a type-erased broadcast binary primitive for the given op and
// source/destination element types; attr0/attr1 are op-specific attributes.
template<BinaryOp binary_op, typename Src, typename Dst>
std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary(Scalar attr0,
                                                                          Scalar attr1) {
  return std::make_unique<BroadcastElementwiseBinaryImpl<binary_op, Src, Dst>>(attr0, attr1);
}
} // namespace broadcast_elementwise_binary
} // namespace primitive
} // namespace ep
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
// Explicit instantiations of the factory for every activation-backward binary
// op over the floating-point type sequence (src and dst types match).
#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY(binary_op,          \
                                                                           data_type_pair)     \
  template std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary<          \
      binary_op, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(data_type_pair)>(          \
      Scalar attr0, Scalar attr1);
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY,
                                 BINARY_ACTIVATION_BACKWARD_OP_SEQ,
                                 CUDA_PRIMITIVE_FLOATING_TYPE_SEQ);
} // namespace broadcast_elementwise_binary
} // namespace primitive
} // namespace ep
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
// Explicit instantiations of the factory for every activation-backward binary
// op over the floating-point type sequence (src and dst types match).
#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY(binary_op,          \
                                                                           data_type_pair)     \
  template std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary<          \
      binary_op, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(data_type_pair)>(          \
      Scalar attr0, Scalar attr1);
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_ACTIVATION_GRAD_ENTRY,
                                 BINARY_ACTIVATION_BACKWARD_OP_SEQ,
                                 CUDA_PRIMITIVE_FLOATING_TYPE_SEQ);
} // namespace broadcast_elementwise_binary
} // namespace primitive
} // namespace ep
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
// Explicit instantiations for comparison ops: any primitive src type, bool
// dst. ("COMPARASION" typo is the established macro name; kept for grep-ability.)
#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_ENTRY(                        \
    binary_op, src_data_type_pair, dst_data_type_pair)                                         \
  template std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary<          \
      binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), OF_PP_PAIR_FIRST(dst_data_type_pair)>(  \
      Scalar attr0, Scalar attr1);
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_ENTRY,
                                 BINARY_COMPARISION_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ,
                                 CUDA_PRIMITIVE_BOOL_TYPE_SEQ);
} // namespace broadcast_elementwise_binary
} // namespace primitive
} // namespace ep
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
// Explicit instantiations for comparison ops: any primitive src type, bool
// dst. ("COMPARASION" typo is the established macro name; kept for grep-ability.)
#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_ENTRY(                        \
    binary_op, src_data_type_pair, dst_data_type_pair)                                         \
  template std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary<          \
      binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), OF_PP_PAIR_FIRST(dst_data_type_pair)>(  \
      Scalar attr0, Scalar attr1);
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_COMPARASION_ENTRY,
                                 BINARY_COMPARISION_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ,
                                 CUDA_PRIMITIVE_BOOL_TYPE_SEQ);
} // namespace broadcast_elementwise_binary
} // namespace primitive
} // namespace ep
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
// Explicit instantiations for comparison and logical ops: any primitive src
// type, bool dst.
#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_LOGICAL_ENTRY(binary_op, src_data_type_pair, \
                                                                   dst_data_type_pair)            \
  template std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary<             \
      binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), OF_PP_PAIR_FIRST(dst_data_type_pair)>(     \
      Scalar attr0, Scalar attr1);
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_LOGICAL_ENTRY,
                                 BINARY_COMPARISION_OP_SEQ BINARY_LOGICAL_OP_SEQ,
                                 CUDA_PRIMITIVE_ALL_TYPE_SEQ, CUDA_PRIMITIVE_BOOL_TYPE_SEQ);
} // namespace broadcast_elementwise_binary
} // namespace primitive
} // namespace ep
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
// Explicit instantiations for comparison and logical ops: any primitive src
// type, bool dst.
#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_LOGICAL_ENTRY(binary_op, src_data_type_pair, \
                                                                   dst_data_type_pair)            \
  template std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary<             \
      binary_op, OF_PP_PAIR_FIRST(src_data_type_pair), OF_PP_PAIR_FIRST(dst_data_type_pair)>(     \
      Scalar attr0, Scalar attr1);
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_LOGICAL_ENTRY,
                                 BINARY_COMPARISION_OP_SEQ BINARY_LOGICAL_OP_SEQ,
                                 CUDA_PRIMITIVE_ALL_TYPE_SEQ, CUDA_PRIMITIVE_BOOL_TYPE_SEQ);
} // namespace broadcast_elementwise_binary
} // namespace primitive
} // namespace ep
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
// Explicit instantiations for elementwise math ops over all primitive types
// (src and dst types match).
#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY(binary_op, data_type_pair)     \
  template std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary<          \
      binary_op, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(data_type_pair)>(          \
      Scalar attr0, Scalar attr1);
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY,
                                 BINARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ);
} // namespace broadcast_elementwise_binary
} // namespace primitive
} // namespace ep
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/primitive/broadcast_elementwise_binary.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_elementwise_binary {
// Explicitly instantiates NewBroadcastElementwiseBinary for every math binary
// op, with identical src/dst data types drawn from CUDA_PRIMITIVE_ALL_TYPE_SEQ.
#define INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY(binary_op, data_type_pair) \
  template std::unique_ptr<BroadcastElementwiseBinary> NewBroadcastElementwiseBinary<      \
      binary_op, OF_PP_PAIR_FIRST(data_type_pair), OF_PP_PAIR_FIRST(data_type_pair)>(      \
      Scalar attr0, Scalar attr1);
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY,
                                 BINARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ);
// Undefine the helper so it cannot leak into code below in this translation unit.
#undef INSTANTIATE_NEW_BROADCAST_ELEMENTWISE_BINARY_MATH_ENTRY
}  // namespace broadcast_elementwise_binary
}  // namespace primitive
}  // namespace ep
}  // namespace oneflow
// NOTE(review): git-diff artifact ("No newline at end of file") neutralized; it is not valid C++.
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_ROCM
#include "oneflow/core/ep/include/primitive/primitive.h"
#include "oneflow/core/ep/include/primitive/broadcast_matmul.h"
#include "oneflow/core/ep/common/primitive/broadcast_matmul.h"
#include "oneflow/core/common/optional.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_matmul {
namespace internal {
namespace {
constexpr size_t kMaxNumDims = 8;
// Translates an oneflow DataType into the hipBLAS element type used for GEMM
// buffers; unsupported types map to NullOpt so callers can bail out cleanly.
// (bfloat16 is intentionally left disabled in this ROCm port.)
Optional<hipblasDatatype_t> OptCudaDataType(DataType data_type) {
  if (data_type == kFloat) { return HIPBLAS_R_32F; }
  if (data_type == kDouble) { return HIPBLAS_R_64F; }
  if (data_type == kFloat16) { return HIPBLAS_R_16F; }
  return NullOpt;
}
// Like OptCudaDataType, but treats an unsupported DataType as a fatal error.
hipblasDatatype_t GetCudaDataType(DataType data_type) {
  const auto opt_type = OptCudaDataType(data_type);
  CHECK(opt_type.has_value());
  return opt_type.value_or(HIPBLAS_R_32F);
}
// Holds an alpha/beta scalar in whichever width the hipBLAS compute type
// expects (double for R_64F, float otherwise).
union CublasScalarParameter {
  double d;
  float s;
};
// Packs a Scalar into the representation hipBLAS expects for the given
// compute type: double for R_64F, float for R_32F and R_16F (fp16 GEMMs
// still receive float here; the caller converts to __half when needed).
CublasScalarParameter GetCublasScalarParameter(Scalar scalar, hipblasDatatype_t compute_type) {
  CublasScalarParameter result{};
  switch (compute_type) {
    case HIPBLAS_R_64F: result.d = scalar.Value<double>(); break;
    case HIPBLAS_R_32F:
    case HIPBLAS_R_16F: result.s = scalar.Value<float>(); break;
    default: UNIMPLEMENTED();
  }
  return result;
}
// Selects the hipBLAS accumulator type for a GEMM over the given element
// type; fp16 inputs accumulate in fp16 here.
hipblasDatatype_t GetComputeType(DataType data_type) {
  if (data_type == kFloat) { return HIPBLAS_R_32F; }
  if (data_type == kDouble) { return HIPBLAS_R_64F; }
  if (data_type == kFloat16) { return HIPBLAS_R_16F; }
  UNIMPLEMENTED();
  return HIPBLAS_R_32F;
}
// Launches a (possibly batched) GEMM computing c = alpha * op(a) * op(b) + beta * c.
//
// hipBLAS uses column-major layout while oneflow tensors are row-major, so
// the call is issued as c^T = op(b)^T * op(a)^T: operands a and b are swapped
// and m/n are exchanged (hence cublas_m = n, cublas_n = m below).
//
// The *_batch_dims arrays describe batch-dimension broadcasting. When there
// is exactly one batch dimension, a single strided-batched GEMM is issued
// (stride 0 broadcasts the non-batched operand); otherwise ForEachMatmul
// issues one plain GEMM per output batch index.
void LaunchBroadcastMatmul(Stream* stream, DataType data_type, BlasTransposeType transpose_a,
                           BlasTransposeType transpose_b, int64_t num_batch_dims,
                           const int64_t* broadcast_batch_dims, const int64_t* a_batch_dims,
                           const int64_t* b_batch_dims, const int64_t* c_batch_dims, int64_t m,
                           int64_t n, int64_t k, Scalar alpha, const void* a, const void* b,
                           Scalar beta, void* c) {
  auto* cuda_stream = stream->As<CudaStream>();
  const auto cuda_data_type = GetCudaDataType(data_type);
  const auto compute_type = GetComputeType(data_type);
  const auto sp_alpha = GetCublasScalarParameter(alpha, compute_type);
  // For fp16 compute, hipBLAS expects alpha/beta passed as __half values.
  __half h_alpha = 0;
  if (compute_type == HIPBLAS_R_16F) { h_alpha = __float2half(sp_alpha.s); }
  const auto GetCublasOperation = [](BlasTransposeType transpose_type) {
    if (transpose_type == BlasTransposeType::N) {
      return HIPBLAS_OP_N;
    } else if (transpose_type == BlasTransposeType::T) {
      return HIPBLAS_OP_T;
    } else {
      UNIMPLEMENTED();
      return HIPBLAS_OP_N;
    }
  };
  // Row-major -> column-major trick: swap a/b (and their transpose flags) and m/n.
  const hipblasOperation_t cublas_trans_a = GetCublasOperation(transpose_b);
  const hipblasOperation_t cublas_trans_b = GetCublasOperation(transpose_a);
  const int cublas_m = n;
  const int cublas_n = m;
  const int cublas_k = k;
  int cublas_lda = 0;
  if (transpose_b == BlasTransposeType::N) {
    cublas_lda = n;
  } else if (transpose_b == BlasTransposeType::T) {
    cublas_lda = k;
  } else {
    UNIMPLEMENTED();
  }
  int cublas_ldb = 0;
  if (transpose_a == BlasTransposeType::N) {
    cublas_ldb = k;
  } else if (transpose_a == BlasTransposeType::T) {
    cublas_ldb = m;
  } else {
    UNIMPLEMENTED();
  }
  const int cublas_ldc = n;
  hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT;
  if (num_batch_dims == 1 && c_batch_dims[0] != 1) {
    // Single batch dimension: one strided-batched GEMM. A stride of 0
    // broadcasts the operand whose batch count is 1 across the whole batch.
    const void* cublas_a = b;
    const void* cublas_b = a;
    void* cublas_c = c;
    const int64_t a_batch_count = a_batch_dims[0];
    const int64_t b_batch_count = b_batch_dims[0];
    CHECK(a_batch_count == 1 || b_batch_count == 1 || a_batch_count == b_batch_count);
    CHECK_GT(a_batch_count, 0);
    CHECK_GT(b_batch_count, 0);
    const int batch_count = std::max(a_batch_count, b_batch_count);
    // Widen before multiplying: cublas_m/n/k are int, so the matrix-size
    // products could overflow 32 bits for large matrices.
    const long long int cublas_stride_a =
        b_batch_count == 1 ? 0 : static_cast<long long int>(cublas_m) * cublas_k;
    const long long int cublas_stride_b =
        a_batch_count == 1 ? 0 : static_cast<long long int>(cublas_k) * cublas_n;
    const long long int cublas_stride_c = static_cast<long long int>(cublas_m) * cublas_n;
    const auto sp_beta = GetCublasScalarParameter(beta, compute_type);
    __half h_beta = 0;
    if (compute_type == HIPBLAS_R_16F) {
      h_beta = __float2half(sp_beta.s);
      OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx(
          cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, cublas_k,
          &h_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_stride_a, cublas_b, cuda_data_type,
          cublas_ldb, cublas_stride_b, &h_beta, cublas_c, cuda_data_type, cublas_ldc,
          cublas_stride_c, batch_count, compute_type, algo));
    } else {
      OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx(
          cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, cublas_k,
          &sp_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_stride_a, cublas_b, cuda_data_type,
          cublas_ldb, cublas_stride_b, &sp_beta, cublas_c, cuda_data_type, cublas_ldc,
          cublas_stride_c, batch_count, compute_type, algo));
    }
  } else {
    // General broadcasting: ForEachMatmul walks the batch index space and
    // invokes this callback once per output matrix.
    auto func = [&](const void* batch_a, const void* batch_b, void* batch_c, Scalar batch_beta) {
      const auto sp_beta = GetCublasScalarParameter(batch_beta, compute_type);
      __half h_beta = 0;
      const void* cublas_a = batch_b;
      const void* cublas_b = batch_a;
      void* cublas_c = batch_c;
      if (compute_type == HIPBLAS_R_16F) {
        h_beta = __float2half(sp_beta.s);
        OF_CUBLAS_CHECK(hipblasGemmEx(
            cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n,
            cublas_k, &h_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_b, cuda_data_type,
            cublas_ldb, &h_beta, cublas_c, cuda_data_type, cublas_ldc, compute_type, algo));
      } else {
        OF_CUBLAS_CHECK(hipblasGemmEx(
            cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n,
            cublas_k, &sp_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_b, cuda_data_type,
            cublas_ldb, &sp_beta, cublas_c, cuda_data_type, cublas_ldc, compute_type, algo));
      }
    };
    ForEachMatmul<kMaxNumDims>(data_type, m, n, k, beta, num_batch_dims, broadcast_batch_dims,
                               a_batch_dims, b_batch_dims, c_batch_dims, a, b, c, func);
  }
}
// Factory producing BroadcastMatmul primitives for this backend; returns
// nullptr when the rank or data type is unsupported.
class BroadcastMatmulFactoryImpl : public BroadcastMatmulFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(BroadcastMatmulFactoryImpl);
  BroadcastMatmulFactoryImpl() = default;
  ~BroadcastMatmulFactoryImpl() override = default;
  std::unique_ptr<BroadcastMatmul> New(DataType data_type, BlasTransposeType transpose_a,
                                       BlasTransposeType transpose_b,
                                       size_t max_num_dims) override {
    // Only types with a hipBLAS mapping and ranks up to kMaxNumDims are supported.
    auto cuda_data_type = OptCudaDataType(data_type);
    if (max_num_dims <= kMaxNumDims && cuda_data_type.has_value()) {
      return std::make_unique<BroadcastMatmulImpl<kMaxNumDims>>(data_type, transpose_a,
                                                                transpose_b);
    } else {
      return nullptr;
    }
  }
};
// NOTE(review): registered under DeviceType::kCUDA — this ROCm port reuses the
// CUDA device tag; confirm that matches the rest of the dtk port.
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, BroadcastMatmulFactory, BroadcastMatmulFactoryImpl);
} // namespace
} // namespace internal
} // namespace broadcast_matmul
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif // WITH_ROCM
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_ROCM
#include "oneflow/core/ep/include/primitive/primitive.h"
#include "oneflow/core/ep/include/primitive/broadcast_matmul.h"
#include "oneflow/core/ep/common/primitive/broadcast_matmul.h"
#include "oneflow/core/common/optional.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace broadcast_matmul {
namespace internal {
namespace {
constexpr size_t kMaxNumDims = 8;
// Maps an oneflow DataType to the matching hipBLAS element type, or NullOpt
// when unsupported (bfloat16 is disabled in this ROCm port).
Optional<hipblasDatatype_t> OptCudaDataType(DataType data_type) {
  switch (data_type) {
    case kFloat: return HIPBLAS_R_32F;
    case kDouble: return HIPBLAS_R_64F;
    case kFloat16: return HIPBLAS_R_16F;
    // #if CUDA_VERSION >= 11000
    //     case kBFloat16: return CUDA_R_16BF;
    // #endif  // CUDA_VERSION >= 11000
    default: return NullOpt;
  }
}
// Like OptCudaDataType, but an unsupported DataType is a fatal error (CHECK).
hipblasDatatype_t GetCudaDataType(DataType data_type) {
  auto cuda_data_type = OptCudaDataType(data_type);
  CHECK(cuda_data_type.has_value());
  return cuda_data_type.value_or(HIPBLAS_R_32F);
}
// Holds an alpha/beta scalar in whichever width the hipBLAS compute type
// expects (double for R_64F, float otherwise).
union CublasScalarParameter {
  double d;
  float s;
};
// Packs a Scalar into the representation hipBLAS expects for the given
// compute type: double for R_64F, float for R_32F/R_16F (callers convert the
// float to __half for fp16 GEMMs).
CublasScalarParameter GetCublasScalarParameter(Scalar scalar, hipblasDatatype_t compute_type) {
  CublasScalarParameter sp{};
  if (compute_type == HIPBLAS_R_64F) {
    sp.d = scalar.Value<double>();
  } else if (compute_type == HIPBLAS_R_32F) {
    sp.s = scalar.Value<float>();
  } else if (compute_type == HIPBLAS_R_16F) {
    sp.s = scalar.Value<float>();
  } else {
    UNIMPLEMENTED();
  }
  return sp;
}
// Selects the hipBLAS accumulator type for a GEMM over the given element
// type; fp16 inputs accumulate in fp16 here.
hipblasDatatype_t GetComputeType(DataType data_type) {
  switch (data_type) {
    case kFloat: return HIPBLAS_R_32F;
    case kDouble: return HIPBLAS_R_64F;
    case kFloat16: return HIPBLAS_R_16F;
    // #if CUDA_VERSION >= 11000
    //     case kBFloat16: return HIPBLAS_R_32F;
    // #endif  // CUDA_VERSION >= 11000
    default: UNIMPLEMENTED(); return HIPBLAS_R_32F;
  }
}
// Launches a (possibly batched) GEMM computing c = alpha * op(a) * op(b) + beta * c.
//
// hipBLAS uses column-major layout while oneflow tensors are row-major, so
// the call is issued as c^T = op(b)^T * op(a)^T: operands a and b are swapped
// and m/n are exchanged (hence cublas_m = n, cublas_n = m below).
//
// The *_batch_dims arrays describe batch-dimension broadcasting. When there
// is exactly one batch dimension, a single strided-batched GEMM is issued
// (stride 0 broadcasts the non-batched operand); otherwise ForEachMatmul
// issues one plain GEMM per output batch index.
void LaunchBroadcastMatmul(Stream* stream, DataType data_type, BlasTransposeType transpose_a,
                           BlasTransposeType transpose_b, int64_t num_batch_dims,
                           const int64_t* broadcast_batch_dims, const int64_t* a_batch_dims,
                           const int64_t* b_batch_dims, const int64_t* c_batch_dims, int64_t m,
                           int64_t n, int64_t k, Scalar alpha, const void* a, const void* b,
                           Scalar beta, void* c) {
  auto* cuda_stream = stream->As<CudaStream>();
  const auto cuda_data_type = GetCudaDataType(data_type);
  const auto compute_type = GetComputeType(data_type);
  const auto sp_alpha = GetCublasScalarParameter(alpha, compute_type);
  // For fp16 compute, hipBLAS expects alpha/beta passed as __half values.
  __half h_alpha = 0;
  if (compute_type == HIPBLAS_R_16F) { h_alpha = __float2half(sp_alpha.s); }
  const auto GetCublasOperation = [](BlasTransposeType transpose_type) {
    if (transpose_type == BlasTransposeType::N) {
      return HIPBLAS_OP_N;
    } else if (transpose_type == BlasTransposeType::T) {
      return HIPBLAS_OP_T;
    } else {
      UNIMPLEMENTED();
      return HIPBLAS_OP_N;
    }
  };
  // Row-major -> column-major trick: swap a/b (and their transpose flags) and m/n.
  const hipblasOperation_t cublas_trans_a = GetCublasOperation(transpose_b);
  const hipblasOperation_t cublas_trans_b = GetCublasOperation(transpose_a);
  const int cublas_m = n;
  const int cublas_n = m;
  const int cublas_k = k;
  int cublas_lda = 0;
  if (transpose_b == BlasTransposeType::N) {
    cublas_lda = n;
  } else if (transpose_b == BlasTransposeType::T) {
    cublas_lda = k;
  } else {
    UNIMPLEMENTED();
  }
  int cublas_ldb = 0;
  if (transpose_a == BlasTransposeType::N) {
    cublas_ldb = k;
  } else if (transpose_a == BlasTransposeType::T) {
    cublas_ldb = m;
  } else {
    UNIMPLEMENTED();
  }
  const int cublas_ldc = n;
  hipblasGemmAlgo_t algo = HIPBLAS_GEMM_DEFAULT;
  if (num_batch_dims == 1 && c_batch_dims[0] != 1) {
    // Single batch dimension: one strided-batched GEMM. A stride of 0
    // broadcasts the operand whose batch count is 1 across the whole batch.
    const void* cublas_a = b;
    const void* cublas_b = a;
    void* cublas_c = c;
    const int64_t a_batch_count = a_batch_dims[0];
    const int64_t b_batch_count = b_batch_dims[0];
    CHECK(a_batch_count == 1 || b_batch_count == 1 || a_batch_count == b_batch_count);
    CHECK_GT(a_batch_count, 0);
    CHECK_GT(b_batch_count, 0);
    const int batch_count = std::max(a_batch_count, b_batch_count);
    // Widen before multiplying: cublas_m/n/k are int, so the matrix-size
    // products could overflow 32 bits for large matrices.
    const long long int cublas_stride_a =
        b_batch_count == 1 ? 0 : static_cast<long long int>(cublas_m) * cublas_k;
    const long long int cublas_stride_b =
        a_batch_count == 1 ? 0 : static_cast<long long int>(cublas_k) * cublas_n;
    const long long int cublas_stride_c = static_cast<long long int>(cublas_m) * cublas_n;
    const auto sp_beta = GetCublasScalarParameter(beta, compute_type);
    __half h_beta = 0;
    if (compute_type == HIPBLAS_R_16F) {
      h_beta = __float2half(sp_beta.s);
      OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx(
          cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, cublas_k,
          &h_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_stride_a, cublas_b, cuda_data_type,
          cublas_ldb, cublas_stride_b, &h_beta, cublas_c, cuda_data_type, cublas_ldc,
          cublas_stride_c, batch_count, compute_type, algo));
    } else {
      OF_CUBLAS_CHECK(hipblasGemmStridedBatchedEx(
          cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n, cublas_k,
          &sp_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_stride_a, cublas_b, cuda_data_type,
          cublas_ldb, cublas_stride_b, &sp_beta, cublas_c, cuda_data_type, cublas_ldc,
          cublas_stride_c, batch_count, compute_type, algo));
    }
  } else {
    // General broadcasting: ForEachMatmul walks the batch index space and
    // invokes this callback once per output matrix.
    auto func = [&](const void* batch_a, const void* batch_b, void* batch_c, Scalar batch_beta) {
      const auto sp_beta = GetCublasScalarParameter(batch_beta, compute_type);
      __half h_beta = 0;
      const void* cublas_a = batch_b;
      const void* cublas_b = batch_a;
      void* cublas_c = batch_c;
      if (compute_type == HIPBLAS_R_16F) {
        h_beta = __float2half(sp_beta.s);
        OF_CUBLAS_CHECK(hipblasGemmEx(
            cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n,
            cublas_k, &h_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_b, cuda_data_type,
            cublas_ldb, &h_beta, cublas_c, cuda_data_type, cublas_ldc, compute_type, algo));
      } else {
        OF_CUBLAS_CHECK(hipblasGemmEx(
            cuda_stream->cublas_handle(), cublas_trans_a, cublas_trans_b, cublas_m, cublas_n,
            cublas_k, &sp_alpha, cublas_a, cuda_data_type, cublas_lda, cublas_b, cuda_data_type,
            cublas_ldb, &sp_beta, cublas_c, cuda_data_type, cublas_ldc, compute_type, algo));
      }
    };
    ForEachMatmul<kMaxNumDims>(data_type, m, n, k, beta, num_batch_dims, broadcast_batch_dims,
                               a_batch_dims, b_batch_dims, c_batch_dims, a, b, c, func);
  }
}
// Factory producing BroadcastMatmul primitives for this backend; returns
// nullptr when the rank or data type is unsupported.
class BroadcastMatmulFactoryImpl : public BroadcastMatmulFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(BroadcastMatmulFactoryImpl);
  BroadcastMatmulFactoryImpl() = default;
  ~BroadcastMatmulFactoryImpl() override = default;
  std::unique_ptr<BroadcastMatmul> New(DataType data_type, BlasTransposeType transpose_a,
                                       BlasTransposeType transpose_b,
                                       size_t max_num_dims) override {
    // Only types with a hipBLAS mapping and ranks up to kMaxNumDims are supported.
    auto cuda_data_type = OptCudaDataType(data_type);
    if (max_num_dims <= kMaxNumDims && cuda_data_type.has_value()) {
      return std::make_unique<BroadcastMatmulImpl<kMaxNumDims>>(data_type, transpose_a,
                                                                transpose_b);
    } else {
      return nullptr;
    }
  }
};
// NOTE(review): registered under DeviceType::kCUDA — this ROCm port reuses the
// CUDA device tag; confirm that matches the rest of the dtk port.
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, BroadcastMatmulFactory, BroadcastMatmulFactoryImpl);
} // namespace
} // namespace internal
} // namespace broadcast_matmul
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif // WITH_ROCM
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/common/preprocessor.h"
#include "oneflow/core/ep/include/primitive/cast.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Generic element-wise cast functor: a plain static_cast between types.
template<typename To, typename From, typename = void>
struct CastFunctor {
  __device__ To operator()(From from) const { return static_cast<To>(from); }
};
// Specialization for casting FROM half to a non-half type: goes through
// float, and provides a vectorized Apply2 that converts two halves at once.
template<typename To>
struct CastFunctor<To, half, typename std::enable_if<!std::is_same<To, half>::value>::type> {
  __device__ To operator()(half from) const { return static_cast<To>(static_cast<float>(from)); }
  // NOTE(review): the half2 reinterpret assumes `from` is suitably aligned for
  // a paired load — confirm the elementwise framework guarantees this.
  __device__ void Apply2(To* to, const half* from) const {
    const float2 f2 = __half22float2(*reinterpret_cast<const half2*>(from));
    to[0] = static_cast<To>(f2.x);
    to[1] = static_cast<To>(f2.y);
  }
};
// Specialization for casting TO half from a non-half type: goes through
// float, with a vectorized Apply2 producing a rounded half2 pair.
template<typename From>
struct CastFunctor<half, From, typename std::enable_if<!std::is_same<From, half>::value>::type> {
  __device__ half operator()(From from) const {
    return static_cast<half>(static_cast<float>(from));
  }
  // NOTE(review): the half2 reinterpret assumes `to` is suitably aligned for a
  // paired store — confirm the elementwise framework guarantees this.
  __device__ void Apply2(half* to, const From* from) const {
    float2 f2;
    f2.x = static_cast<float>(from[0]);
    f2.y = static_cast<float>(from[1]);
    *reinterpret_cast<half2*>(to) = __float22half2_rn(f2);
  }
};
// #if CUDA_VERSION >= 11000
// template<typename To>
// struct CastFunctor<To, nv_bfloat16,
// typename std::enable_if<!(std::is_same<To, nv_bfloat16>::value
// || std::is_same<To, half>::value)>::type> {
// __device__ To operator()(nv_bfloat16 from) const {
// return static_cast<To>(static_cast<float>(from));
// }
// };
// template<typename From>
// struct CastFunctor<nv_bfloat16, From,
// typename std::enable_if<!(std::is_same<From, nv_bfloat16>::value
// || std::is_same<From, half>::value)>::type> {
// __device__ nv_bfloat16 operator()(From from) const {
// return static_cast<nv_bfloat16>(static_cast<float>(from));
// }
// };
// #endif // CUDA_VERSION >= 11000
// Cast primitive implementation: launches an element-wise Unary kernel that
// applies CastFunctor<To, From> to `count` elements on the stream's device.
template<typename From, typename To>
class CastImpl : public Cast {
 public:
  OF_DISALLOW_COPY_AND_MOVE(CastImpl);
  explicit CastImpl() = default;
  ~CastImpl() override = default;
  void Launch(Stream* stream, const void* from, void* to, size_t count) override {
    auto* cuda_stream = stream->As<CudaStream>();
    OF_CUDA_CHECK((cuda::elementwise::Unary<CastFunctor<To, From>, To, From>(
        CastFunctor<To, From>(), count, reinterpret_cast<To*>(to),
        reinterpret_cast<const From*>(from), cuda_stream->cuda_stream())));
  }
};
template<typename From, typename To>
std::unique_ptr<Cast> NewCast() {
return std::unique_ptr<Cast>(new CastImpl<From, To>());
}
// Every data type a Cast primitive can convert from or to; the factory below
// instantiates the full cross product of this sequence with itself.
#define CUDA_PRIMITIVE_CAST_TYPE_SEQ \
  CUDA_PRIMITIVE_BOOL_TYPE_SEQ       \
  CUDA_PRIMITIVE_CHAR_TYPE_SEQ       \
  CUDA_PRIMITIVE_INT8_TYPE_SEQ       \
  CUDA_PRIMITIVE_UINT8_TYPE_SEQ      \
  CUDA_PRIMITIVE_INT32_TYPE_SEQ      \
  CUDA_PRIMITIVE_UINT32_TYPE_SEQ     \
  CUDA_PRIMITIVE_INT64_TYPE_SEQ      \
  CUDA_PRIMITIVE_UINT64_TYPE_SEQ     \
  CUDA_PRIMITIVE_FLOAT_TYPE_SEQ      \
  CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ     \
  CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ    \
  CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
// Factory that builds Cast primitives.  A static table maps every supported
// (from, to) DataType pair to its NewCast instantiation; unsupported pairs
// yield nullptr.
class CastFactoryImpl : public CastFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(CastFactoryImpl);
  CastFactoryImpl() = default;
  ~CastFactoryImpl() override = default;
  std::unique_ptr<Cast> New(DataType from, DataType to) override {
#define MAKE_NEW_CAST_ENTRY(from_pair, to_pair)                              \
  {std::make_pair(OF_PP_PAIR_SECOND(from_pair), OF_PP_PAIR_SECOND(to_pair)), \
   NewCast<OF_PP_PAIR_FIRST(from_pair), OF_PP_PAIR_FIRST(to_pair)>},
    // Built once on first call; maps runtime DataType pairs to factory functions.
    static const std::map<std::pair<DataType, DataType>, std::function<std::unique_ptr<Cast>()>>
        new_cast_handle{OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(
            MAKE_NEW_CAST_ENTRY, CUDA_PRIMITIVE_CAST_TYPE_SEQ, CUDA_PRIMITIVE_CAST_TYPE_SEQ)};
#undef MAKE_NEW_CAST_ENTRY
    const auto it = new_cast_handle.find(std::make_pair(from, to));
    if (it != new_cast_handle.end()) {
      return it->second();
    } else {
      return nullptr;
    }
  }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, CastFactory, CastFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/common/preprocessor.h"
#include "oneflow/core/ep/include/primitive/cast.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Generic element-wise cast functor: a plain static_cast between types.
template<typename To, typename From, typename = void>
struct CastFunctor {
  __device__ To operator()(From from) const { return static_cast<To>(from); }
};
// Specialization for casting FROM half to a non-half type: goes through
// float, and provides a vectorized Apply2 that converts two halves at once.
template<typename To>
struct CastFunctor<To, half, typename std::enable_if<!std::is_same<To, half>::value>::type> {
  __device__ To operator()(half from) const { return static_cast<To>(static_cast<float>(from)); }
  // NOTE(review): the half2 reinterpret assumes `from` is suitably aligned for
  // a paired load — confirm the elementwise framework guarantees this.
  __device__ void Apply2(To* to, const half* from) const {
    const float2 f2 = __half22float2(*reinterpret_cast<const half2*>(from));
    to[0] = static_cast<To>(f2.x);
    to[1] = static_cast<To>(f2.y);
  }
};
// Specialization for casting TO half from a non-half type: goes through
// float, with a vectorized Apply2 producing a rounded half2 pair.
template<typename From>
struct CastFunctor<half, From, typename std::enable_if<!std::is_same<From, half>::value>::type> {
  __device__ half operator()(From from) const {
    return static_cast<half>(static_cast<float>(from));
  }
  // NOTE(review): the half2 reinterpret assumes `to` is suitably aligned for a
  // paired store — confirm the elementwise framework guarantees this.
  __device__ void Apply2(half* to, const From* from) const {
    float2 f2;
    f2.x = static_cast<float>(from[0]);
    f2.y = static_cast<float>(from[1]);
    *reinterpret_cast<half2*>(to) = __float22half2_rn(f2);
  }
};
// #if CUDA_VERSION >= 11000
// template<typename To>
// struct CastFunctor<To, nv_bfloat16,
// typename std::enable_if<!(std::is_same<To, nv_bfloat16>::value
// || std::is_same<To, half>::value)>::type> {
// __device__ To operator()(nv_bfloat16 from) const {
// return static_cast<To>(static_cast<float>(from));
// }
// };
// template<typename From>
// struct CastFunctor<nv_bfloat16, From,
// typename std::enable_if<!(std::is_same<From, nv_bfloat16>::value
// || std::is_same<From, half>::value)>::type> {
// __device__ nv_bfloat16 operator()(From from) const {
// return static_cast<nv_bfloat16>(static_cast<float>(from));
// }
// };
// #endif // CUDA_VERSION >= 11000
// Cast primitive implementation: launches an element-wise Unary kernel that
// applies CastFunctor<To, From> to `count` elements on the stream's device.
template<typename From, typename To>
class CastImpl : public Cast {
 public:
  OF_DISALLOW_COPY_AND_MOVE(CastImpl);
  explicit CastImpl() = default;
  ~CastImpl() override = default;
  void Launch(Stream* stream, const void* from, void* to, size_t count) override {
    auto* cuda_stream = stream->As<CudaStream>();
    OF_CUDA_CHECK((cuda::elementwise::Unary<CastFunctor<To, From>, To, From>(
        CastFunctor<To, From>(), count, reinterpret_cast<To*>(to),
        reinterpret_cast<const From*>(from), cuda_stream->cuda_stream())));
  }
};
template<typename From, typename To>
std::unique_ptr<Cast> NewCast() {
return std::unique_ptr<Cast>(new CastImpl<From, To>());
}
// Every data type a Cast primitive can convert from or to; the factory below
// instantiates the full cross product of this sequence with itself.
#define CUDA_PRIMITIVE_CAST_TYPE_SEQ \
  CUDA_PRIMITIVE_BOOL_TYPE_SEQ       \
  CUDA_PRIMITIVE_CHAR_TYPE_SEQ       \
  CUDA_PRIMITIVE_INT8_TYPE_SEQ       \
  CUDA_PRIMITIVE_UINT8_TYPE_SEQ      \
  CUDA_PRIMITIVE_INT32_TYPE_SEQ      \
  CUDA_PRIMITIVE_UINT32_TYPE_SEQ     \
  CUDA_PRIMITIVE_INT64_TYPE_SEQ      \
  CUDA_PRIMITIVE_UINT64_TYPE_SEQ     \
  CUDA_PRIMITIVE_FLOAT_TYPE_SEQ      \
  CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ     \
  CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ    \
  CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
// Factory that builds Cast primitives.  A static table maps every supported
// (from, to) DataType pair to its NewCast instantiation; unsupported pairs
// yield nullptr.
class CastFactoryImpl : public CastFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(CastFactoryImpl);
  CastFactoryImpl() = default;
  ~CastFactoryImpl() override = default;
  std::unique_ptr<Cast> New(DataType from, DataType to) override {
#define MAKE_NEW_CAST_ENTRY(from_pair, to_pair)                              \
  {std::make_pair(OF_PP_PAIR_SECOND(from_pair), OF_PP_PAIR_SECOND(to_pair)), \
   NewCast<OF_PP_PAIR_FIRST(from_pair), OF_PP_PAIR_FIRST(to_pair)>},
    // Built once on first call; maps runtime DataType pairs to factory functions.
    static const std::map<std::pair<DataType, DataType>, std::function<std::unique_ptr<Cast>()>>
        new_cast_handle{OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(
            MAKE_NEW_CAST_ENTRY, CUDA_PRIMITIVE_CAST_TYPE_SEQ, CUDA_PRIMITIVE_CAST_TYPE_SEQ)};
#undef MAKE_NEW_CAST_ENTRY
    const auto it = new_cast_handle.find(std::make_pair(from, to));
    if (it != new_cast_handle.end()) {
      return it->second();
    } else {
      return nullptr;
    }
  }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, CastFactory, CastFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/include/primitive/constant_pad.h"
#include "oneflow/core/ep/common/primitive/constant_pad.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Device kernel for constant padding.  Each output element either copies the
// corresponding input element (inside the valid region) or stores the packed
// pad value.  StorageType is a packed group of elements, so one iteration
// handles a whole pack.
template<size_t num_dims, typename IndexType, typename StorageType>
__global__ void ConstantPadKernel(ConstantPadParams<num_dims, IndexType> params,
                                  StorageType packed_pad_val) {
  const StorageType* src = reinterpret_cast<const StorageType*>(params.src);
  StorageType* dst = reinterpret_cast<StorageType*>(params.dst);
  IndexType src_index[num_dims];
  IndexType dst_index[num_dims];
  CUDA_1D_KERNEL_LOOP_T(IndexType, linear_index, params.elem_cnt) {
    params.dst_index_helper.OffsetToNdIndex(linear_index, dst_index);
    bool if_pad = false;
#pragma unroll
    for (int i = 0; i < num_dims; i++) {
      // Inside [valid_start, valid_end) the output maps to an input element;
      // otherwise this element lies in the padded border.
      if (dst_index[i] >= params.valid_start[i] && dst_index[i] < params.valid_end[i]) {
        src_index[i] = dst_index[i] - params.valid_start[i];
      } else {
        if_pad = true;
        break;
      }
    }
    StorageType dst_val = packed_pad_val;
    if (!if_pad) {
      const IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index);
      dst_val = src[src_offset];
    }
    dst[linear_index] = dst_val;
  }
}
// Specialization for half: read the Scalar as float, then narrow to half.
template<>
half GetValue<half>(Scalar value) {
  return static_cast<half>(GetValue<float>(value));
}
// #if CUDA_VERSION >= 11000
// template<>
// nv_bfloat16 GetValue<nv_bfloat16>(Scalar value) {
// return static_cast<nv_bfloat16>(GetValue<float>(value));
// }
// #endif // CUDA_VERSION >= 11000
// Launches ConstantPadKernel with a default-waves grid on the given stream.
template<size_t num_dims, typename IndexType, typename StorageType>
void LaunchKernel(Stream* stream, ConstantPadParams<num_dims, IndexType> params,
                  StorageType packed_pad_val, size_t elem_cnt) {
  stream->As<CudaStream>()->LaunchKernelDefaultWaves(
      (ConstantPadKernel<num_dims, IndexType, StorageType>), elem_cnt, params, packed_pad_val);
}
// Builds ConstantPadParams (index helpers plus the valid, i.e. non-padded,
// region per dimension) and launches the kernel.
template<size_t num_dims, typename IndexType, typename StorageType>
void LaunchKernel(Stream* stream, void* dst, const int64_t* dst_dims, const void* src,
                  const int64_t* src_dims, const int64_t* padding_before,
                  const int64_t* padding_after, StorageType packed_pad_val, size_t elem_cnt) {
  ConstantPadParams<num_dims, IndexType> params;
  params.dst_index_helper = OffsetToIndexCalculator<IndexType, num_dims>(dst_dims);
  params.src_index_helper = NdIndexOffsetHelper<IndexType, num_dims>(src_dims);
  params.dst = dst;
  params.src = src;
  for (int i = 0; i < num_dims; i++) {
    // Valid region of dim i is [padding_before[i], dst_dims[i] - padding_after[i]).
    params.valid_start[i] = padding_before[i];
    params.valid_end[i] = dst_dims[i] - padding_after[i];
  }
  params.elem_cnt = elem_cnt;
  LaunchKernel<num_dims, IndexType, StorageType>(stream, params, packed_pad_val, elem_cnt);
}
// Picks 32-bit or 64-bit indexing for the pad kernel; 32-bit indexing is
// used whenever the element count fits, since it is cheaper on device.
template<size_t num_dims, typename StorageType>
void DispatchIndexType(Stream* stream, void* dst, const int64_t* dst_dims, const void* src,
                       const int64_t* src_dims, const int64_t* padding_before,
                       const int64_t* padding_after, StorageType packed_pad_val, size_t elem_cnt) {
  if (elem_cnt >= GetMaxVal<int32_t>()) {
    LaunchKernel<num_dims, int64_t, StorageType>(stream, dst, dst_dims, src, src_dims,
                                                 padding_before, padding_after, packed_pad_val,
                                                 elem_cnt);
  } else {
    LaunchKernel<num_dims, int32_t, StorageType>(stream, dst, dst_dims, src, src_dims,
                                                 padding_before, padding_after, packed_pad_val,
                                                 elem_cnt);
  }
}
// Chooses the widest pack (vector) width the buffers support, folds the pack
// factor into the innermost dimension, and dispatches to the matching
// PackType instantiation.
template<size_t num_dims, typename T>
void DispatchPackSize(Stream* stream, void* dst, int64_t* dst_dims, const void* src,
                      int64_t* src_dims, int64_t* padding_before, int64_t* padding_after,
                      T pad_val) {
  constexpr int32_t max_packsize = GetMaxPackSize<T>();
  const size_t pack = GetLaunchPackSize<max_packsize>(num_dims, dst, dst_dims, src, src_dims,
                                                      padding_before, padding_after);
  // Shrink the innermost dimension so the kernel indexes whole packs.
  dst_dims[num_dims - 1] /= pack;
  src_dims[num_dims - 1] /= pack;
  padding_before[num_dims - 1] /= pack;
  padding_after[num_dims - 1] /= pack;
  size_t elem_cnt = 1;
  for (int i = 0; i < num_dims; i++) { elem_cnt *= dst_dims[i]; }
  switch (pack) {
    case 1: {
      Pack<T, 1> packed(pad_val);
      DispatchIndexType<num_dims, PackType<T, 1>>(stream, dst, dst_dims, src, src_dims,
                                                  padding_before, padding_after, packed.storage,
                                                  elem_cnt);
      break;
    }
    case 2: {
      Pack<T, 2> packed(pad_val);
      DispatchIndexType<num_dims, PackType<T, 2>>(stream, dst, dst_dims, src, src_dims,
                                                  padding_before, padding_after, packed.storage,
                                                  elem_cnt);
      break;
    }
    case 4: {
      Pack<T, 4> packed(pad_val);
      DispatchIndexType<num_dims, PackType<T, 4>>(stream, dst, dst_dims, src, src_dims,
                                                  padding_before, padding_after, packed.storage,
                                                  elem_cnt);
      break;
    }
    case 8: {
      Pack<T, 8> packed(pad_val);
      DispatchIndexType<num_dims, PackType<T, 8>>(stream, dst, dst_dims, src, src_dims,
                                                  padding_before, padding_after, packed.storage,
                                                  elem_cnt);
      break;
    }
    case 16: {
      Pack<T, 16> packed(pad_val);
      DispatchIndexType<num_dims, PackType<T, 16>>(stream, dst, dst_dims, src, src_dims,
                                                   padding_before, padding_after, packed.storage,
                                                   elem_cnt);
      break;
    }
    default: UNIMPLEMENTED();
  }
}
// Dispatches the runtime rank (1..8) to the matching compile-time
// DispatchPackSize instantiation.
template<typename T>
void LaunchWithSimplified(Stream* stream, size_t num_dims, void* dst, int64_t* dst_dims,
                          const void* src, int64_t* src_dims, int64_t* padding_before,
                          int64_t* padding_after, T pad_val) {
  switch (num_dims) {
    case 1:
      DispatchPackSize<1, T>(stream, dst, dst_dims, src, src_dims, padding_before, padding_after,
                             pad_val);
      break;
    case 2:
      DispatchPackSize<2, T>(stream, dst, dst_dims, src, src_dims, padding_before, padding_after,
                             pad_val);
      break;
    case 3:
      DispatchPackSize<3, T>(stream, dst, dst_dims, src, src_dims, padding_before, padding_after,
                             pad_val);
      break;
    case 4:
      DispatchPackSize<4, T>(stream, dst, dst_dims, src, src_dims, padding_before, padding_after,
                             pad_val);
      break;
    case 5:
      DispatchPackSize<5, T>(stream, dst, dst_dims, src, src_dims, padding_before, padding_after,
                             pad_val);
      break;
    case 6:
      DispatchPackSize<6, T>(stream, dst, dst_dims, src, src_dims, padding_before, padding_after,
                             pad_val);
      break;
    case 7:
      DispatchPackSize<7, T>(stream, dst, dst_dims, src, src_dims, padding_before, padding_after,
                             pad_val);
      break;
    case 8:
      DispatchPackSize<8, T>(stream, dst, dst_dims, src, src_dims, padding_before, padding_after,
                             pad_val);
      break;
    default: UNIMPLEMENTED();
  }
}
template<typename T>
void SimplifyThenLaunch(Stream* stream, size_t num_dims, const int64_t* src_dims, const void* src,
                        const int64_t* padding_before, const int64_t* padding_after, T pad_val,
                        void* dst) {
  // Reduces the dimension description via SimplifyPadDims before dispatching,
  // so the kernel instantiation works on the smallest equivalent rank.
  CHECK_LE(num_dims, kMaxNumDims);
  int64_t folded_dst_dims[kMaxNumDims];
  int64_t folded_src_dims[kMaxNumDims];
  int64_t folded_pad_before[kMaxNumDims];
  int64_t folded_pad_after[kMaxNumDims];
  size_t folded_num_dims = 1;
  SimplifyPadDims(num_dims, src_dims, padding_before, padding_after, &folded_num_dims,
                  folded_dst_dims, folded_src_dims, folded_pad_before, folded_pad_after);
  LaunchWithSimplified<T>(stream, folded_num_dims, dst, folded_dst_dims, src, folded_src_dims,
                          folded_pad_before, folded_pad_after, pad_val);
}
// ConstantPad primitive for element type T: pads a tensor with a constant value.
template<typename T>
class ConstantPadImpl : public ConstantPad {
 public:
  OF_DISALLOW_COPY_AND_MOVE(ConstantPadImpl);
  ConstantPadImpl() = default;
  ~ConstantPadImpl() override = default;
  // Converts the Scalar pad value to T, then delegates to SimplifyThenLaunch.
  void Launch(Stream* stream, size_t num_dims, const int64_t* src_dims, const void* src,
              const int64_t* padding_before, const int64_t* padding_after, Scalar pad_val,
              void* dst) override {
    SimplifyThenLaunch<T>(stream, num_dims, src_dims, src, padding_before, padding_after,
                          GetValue<T>(pad_val), dst);
  }
};
// Factory helper: builds a ConstantPad primitive for element type T.
template<typename T>
std::unique_ptr<ConstantPad> NewConstantPad() {
  return std::unique_ptr<ConstantPad>(new ConstantPadImpl<T>());
}
class ConstantPadFactoryImpl : public ConstantPadFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(ConstantPadFactoryImpl);
  ConstantPadFactoryImpl() = default;
  ~ConstantPadFactoryImpl() override = default;
  // Returns a ConstantPad primitive for data_type, or nullptr when the type is
  // not in CUDA_PRIMITIVE_ALL_TYPE_SEQ.
  std::unique_ptr<ConstantPad> New(DataType data_type) override {
#define MAKE_NEW_CONSTANT_PAD_ENTRY(type_cpp, type_proto) {type_proto, NewConstantPad<type_cpp>},
    // Static dispatch table (DataType -> factory), built once on first call.
    static const std::map<DataType, std::function<std::unique_ptr<ConstantPad>()>>
        new_constant_pad_handle{
            OF_PP_FOR_EACH_TUPLE(MAKE_NEW_CONSTANT_PAD_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)};
#undef MAKE_NEW_CONSTANT_PAD_ENTRY
    const auto it = new_constant_pad_handle.find(data_type);
    if (it != new_constant_pad_handle.end()) {
      return it->second();
    } else {
      return nullptr;
    }
  }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, ConstantPadFactory, ConstantPadFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/include/primitive/constant_pad.h"
#include "oneflow/core/ep/common/primitive/constant_pad.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Grid-stride device kernel: for every destination element, copy from src when
// the element lies inside the valid (non-padded) window, otherwise store the
// packed pad value. StorageType is the packed store unit chosen by the host.
template<size_t num_dims, typename IndexType, typename StorageType>
__global__ void ConstantPadKernel(ConstantPadParams<num_dims, IndexType> params,
                                  StorageType packed_pad_val) {
  const StorageType* src = reinterpret_cast<const StorageType*>(params.src);
  StorageType* dst = reinterpret_cast<StorageType*>(params.dst);
  IndexType src_index[num_dims];
  IndexType dst_index[num_dims];
  CUDA_1D_KERNEL_LOOP_T(IndexType, linear_index, params.elem_cnt) {
    params.dst_index_helper.OffsetToNdIndex(linear_index, dst_index);
    bool if_pad = false;
#pragma unroll
    for (int i = 0; i < num_dims; i++) {
      if (dst_index[i] >= params.valid_start[i] && dst_index[i] < params.valid_end[i]) {
        // Inside the valid window: translate dst coordinate to src coordinate.
        src_index[i] = dst_index[i] - params.valid_start[i];
      } else {
        // Out of range in any dimension makes the whole element padding.
        if_pad = true;
        break;
      }
    }
    StorageType dst_val = packed_pad_val;
    if (!if_pad) {
      const IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index);
      dst_val = src[src_offset];
    }
    dst[linear_index] = dst_val;
  }
}
// half has no direct Scalar accessor; extract as float and narrow.
template<>
half GetValue<half>(Scalar value) {
  return static_cast<half>(GetValue<float>(value));
}
// #if CUDA_VERSION >= 11000
// template<>
// nv_bfloat16 GetValue<nv_bfloat16>(Scalar value) {
// return static_cast<nv_bfloat16>(GetValue<float>(value));
// }
// #endif // CUDA_VERSION >= 11000
// Launches ConstantPadKernel on the stream with a default-waves grid sized
// for elem_cnt packed elements.
template<size_t num_dims, typename IndexType, typename StorageType>
void LaunchKernel(Stream* stream, ConstantPadParams<num_dims, IndexType> params,
                  StorageType packed_pad_val, size_t elem_cnt) {
  stream->As<CudaStream>()->LaunchKernelDefaultWaves(
      (ConstantPadKernel<num_dims, IndexType, StorageType>), elem_cnt, params, packed_pad_val);
}
// Builds the kernel parameter struct (index helpers plus the valid source
// window inside dst) and forwards to the params-based LaunchKernel overload.
template<size_t num_dims, typename IndexType, typename StorageType>
void LaunchKernel(Stream* stream, void* dst, const int64_t* dst_dims, const void* src,
                  const int64_t* src_dims, const int64_t* padding_before,
                  const int64_t* padding_after, StorageType packed_pad_val, size_t elem_cnt) {
  ConstantPadParams<num_dims, IndexType> params;
  params.dst_index_helper = OffsetToIndexCalculator<IndexType, num_dims>(dst_dims);
  params.src_index_helper = NdIndexOffsetHelper<IndexType, num_dims>(src_dims);
  params.dst = dst;
  params.src = src;
  for (int i = 0; i < num_dims; i++) {
    // Copied region per dimension: [padding_before[i], dst_dims[i] - padding_after[i]).
    params.valid_start[i] = padding_before[i];
    params.valid_end[i] = dst_dims[i] - padding_after[i];
  }
  params.elem_cnt = elem_cnt;
  LaunchKernel<num_dims, IndexType, StorageType>(stream, params, packed_pad_val, elem_cnt);
}
template<size_t num_dims, typename StorageType>
void DispatchIndexType(Stream* stream, void* dst, const int64_t* dst_dims, const void* src,
                       const int64_t* src_dims, const int64_t* padding_before,
                       const int64_t* padding_after, StorageType packed_pad_val, size_t elem_cnt) {
  // Prefer 32-bit indexing when the element count fits; index arithmetic in the
  // kernel is cheaper with the narrower type.
  const bool fits_in_int32 = elem_cnt < GetMaxVal<int32_t>();
  if (fits_in_int32) {
    LaunchKernel<num_dims, int32_t, StorageType>(stream, dst, dst_dims, src, src_dims,
                                                 padding_before, padding_after, packed_pad_val,
                                                 elem_cnt);
    return;
  }
  LaunchKernel<num_dims, int64_t, StorageType>(stream, dst, dst_dims, src, src_dims,
                                               padding_before, padding_after, packed_pad_val,
                                               elem_cnt);
}
template<size_t num_dims, typename T>
void DispatchPackSize(Stream* stream, void* dst, int64_t* dst_dims, const void* src,
                      int64_t* src_dims, int64_t* padding_before, int64_t* padding_after,
                      T pad_val) {
  // Choose the widest pack (vectorized store width in elements) the shapes and
  // pointers allow, fold it into the innermost dimension, then dispatch on it.
  constexpr int32_t max_packsize = GetMaxPackSize<T>();
  const size_t pack = GetLaunchPackSize<max_packsize>(num_dims, dst, dst_dims, src, src_dims,
                                                      padding_before, padding_after);
  dst_dims[num_dims - 1] /= pack;
  src_dims[num_dims - 1] /= pack;
  padding_before[num_dims - 1] /= pack;
  padding_after[num_dims - 1] /= pack;
  size_t elem_cnt = 1;
  for (size_t i = 0; i < num_dims; ++i) { elem_cnt *= dst_dims[i]; }
  switch (pack) {
    case 1: {
      Pack<T, 1> packed(pad_val);
      DispatchIndexType<num_dims, PackType<T, 1>>(stream, dst, dst_dims, src, src_dims,
                                                  padding_before, padding_after, packed.storage,
                                                  elem_cnt);
      break;
    }
    case 2: {
      Pack<T, 2> packed(pad_val);
      DispatchIndexType<num_dims, PackType<T, 2>>(stream, dst, dst_dims, src, src_dims,
                                                  padding_before, padding_after, packed.storage,
                                                  elem_cnt);
      break;
    }
    case 4: {
      Pack<T, 4> packed(pad_val);
      DispatchIndexType<num_dims, PackType<T, 4>>(stream, dst, dst_dims, src, src_dims,
                                                  padding_before, padding_after, packed.storage,
                                                  elem_cnt);
      break;
    }
    case 8: {
      Pack<T, 8> packed(pad_val);
      DispatchIndexType<num_dims, PackType<T, 8>>(stream, dst, dst_dims, src, src_dims,
                                                  padding_before, padding_after, packed.storage,
                                                  elem_cnt);
      break;
    }
    case 16: {
      Pack<T, 16> packed(pad_val);
      DispatchIndexType<num_dims, PackType<T, 16>>(stream, dst, dst_dims, src, src_dims,
                                                   padding_before, padding_after, packed.storage,
                                                   elem_cnt);
      break;
    }
    default: UNIMPLEMENTED();
  }
}
template<typename T>
void LaunchWithSimplified(Stream* stream, size_t num_dims, void* dst, int64_t* dst_dims,
                          const void* src, int64_t* src_dims, int64_t* padding_before,
                          int64_t* padding_after, T pad_val) {
  // Bridges the runtime dimension count to the compile-time rank expected by
  // DispatchPackSize (1..8), then forwards every argument unchanged.
  using LaunchFn = void (*)(Stream*, void*, int64_t*, const void*, int64_t*, int64_t*, int64_t*, T);
  LaunchFn launch = nullptr;
  switch (num_dims) {
    case 1: launch = DispatchPackSize<1, T>; break;
    case 2: launch = DispatchPackSize<2, T>; break;
    case 3: launch = DispatchPackSize<3, T>; break;
    case 4: launch = DispatchPackSize<4, T>; break;
    case 5: launch = DispatchPackSize<5, T>; break;
    case 6: launch = DispatchPackSize<6, T>; break;
    case 7: launch = DispatchPackSize<7, T>; break;
    case 8: launch = DispatchPackSize<8, T>; break;
    default: UNIMPLEMENTED();
  }
  launch(stream, dst, dst_dims, src, src_dims, padding_before, padding_after, pad_val);
}
template<typename T>
void SimplifyThenLaunch(Stream* stream, size_t num_dims, const int64_t* src_dims, const void* src,
                        const int64_t* padding_before, const int64_t* padding_after, T pad_val,
                        void* dst) {
  // Reduces the dimension description via SimplifyPadDims before dispatching,
  // so the kernel instantiation works on the smallest equivalent rank.
  CHECK_LE(num_dims, kMaxNumDims);
  int64_t folded_dst_dims[kMaxNumDims];
  int64_t folded_src_dims[kMaxNumDims];
  int64_t folded_pad_before[kMaxNumDims];
  int64_t folded_pad_after[kMaxNumDims];
  size_t folded_num_dims = 1;
  SimplifyPadDims(num_dims, src_dims, padding_before, padding_after, &folded_num_dims,
                  folded_dst_dims, folded_src_dims, folded_pad_before, folded_pad_after);
  LaunchWithSimplified<T>(stream, folded_num_dims, dst, folded_dst_dims, src, folded_src_dims,
                          folded_pad_before, folded_pad_after, pad_val);
}
// ConstantPad primitive for element type T: pads a tensor with a constant value.
template<typename T>
class ConstantPadImpl : public ConstantPad {
 public:
  OF_DISALLOW_COPY_AND_MOVE(ConstantPadImpl);
  ConstantPadImpl() = default;
  ~ConstantPadImpl() override = default;
  // Converts the Scalar pad value to T, then delegates to SimplifyThenLaunch.
  void Launch(Stream* stream, size_t num_dims, const int64_t* src_dims, const void* src,
              const int64_t* padding_before, const int64_t* padding_after, Scalar pad_val,
              void* dst) override {
    SimplifyThenLaunch<T>(stream, num_dims, src_dims, src, padding_before, padding_after,
                          GetValue<T>(pad_val), dst);
  }
};
// Factory helper: builds a ConstantPad primitive for element type T.
template<typename T>
std::unique_ptr<ConstantPad> NewConstantPad() {
  return std::unique_ptr<ConstantPad>(new ConstantPadImpl<T>());
}
class ConstantPadFactoryImpl : public ConstantPadFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(ConstantPadFactoryImpl);
  ConstantPadFactoryImpl() = default;
  ~ConstantPadFactoryImpl() override = default;
  // Returns a ConstantPad primitive for data_type, or nullptr when the type is
  // not in CUDA_PRIMITIVE_ALL_TYPE_SEQ.
  std::unique_ptr<ConstantPad> New(DataType data_type) override {
#define MAKE_NEW_CONSTANT_PAD_ENTRY(type_cpp, type_proto) {type_proto, NewConstantPad<type_cpp>},
    // Static dispatch table (DataType -> factory), built once on first call.
    static const std::map<DataType, std::function<std::unique_ptr<ConstantPad>()>>
        new_constant_pad_handle{
            OF_PP_FOR_EACH_TUPLE(MAKE_NEW_CONSTANT_PAD_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)};
#undef MAKE_NEW_CONSTANT_PAD_ENTRY
    const auto it = new_constant_pad_handle.find(data_type);
    if (it != new_constant_pad_handle.end()) {
      return it->second();
    } else {
      return nullptr;
    }
  }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, ConstantPadFactory, ConstantPadFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
// NOTE: git "\ No newline at end of file" marker left behind by file concatenation; kept as a comment so the translation unit stays parseable.
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/include/primitive/copy_nd.h"
#include "oneflow/core/ep/common/primitive/copy_nd.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Grid-stride device kernel: copies an n-d sub-region of params.count elements,
// translating each copy coordinate by src_pos/dst_pos, moving movement_size
// bytes per element through an aligned opaque type.
template<size_t num_dims, size_t movement_size, typename IndexType>
__global__ void CopyNdKernel(CopyNdKernelParams<num_dims, IndexType> params) {
  using T = typename std::aligned_storage<movement_size, movement_size>::type;
  const T* src = reinterpret_cast<const T*>(params.src);
  T* dst = reinterpret_cast<T*>(params.dst);
  IndexType copy_index[num_dims];
  IndexType src_index[num_dims];
  IndexType dst_index[num_dims];
  CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) {
    params.copy_index_helper.OffsetToNdIndex(i, copy_index);
#pragma unroll
    for (size_t j = 0; j < num_dims; ++j) {
      // Offset the region-local coordinate into each tensor's coordinate space.
      src_index[j] = params.src_pos[j] + copy_index[j];
      dst_index[j] = params.dst_pos[j] + copy_index[j];
    }
    const IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index);
    const IndexType dst_offset = params.dst_index_helper.NdIndexToOffset(dst_index);
    dst[dst_offset] = src[src_offset];
  }
}
// Launches CopyNdKernel on the stream's HIP queue with the standard 1-D grid.
template<size_t num_dims, size_t movement_size, typename IndexType>
void LaunchKernel(Stream* stream, CopyNdKernelParams<num_dims, IndexType> params) {
  hipStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
  CopyNdKernel<num_dims, movement_size, IndexType>
      <<<BlocksNum4ThreadsNum(params.count), kCudaThreadsNumPerBlock, 0, cuda_stream>>>(params);
}
// CopyNd primitive: copies an n-d region between tensors of the same dtype.
class CopyNdImpl : public CopyNd {
 public:
  OF_DISALLOW_COPY_AND_MOVE(CopyNdImpl);
  CopyNdImpl() = default;
  ~CopyNdImpl() override = default;
  // Delegates to the shared SimplifyThenLaunch helper (common/copy_nd.h),
  // which reduces the copy description before launching.
  void Launch(Stream* stream, DataType data_type, size_t num_dims, void* dst,
              const int64_t* dst_dims, const int64_t* dst_pos, const void* src,
              const int64_t* src_dims, const int64_t* src_pos,
              const int64_t* extent) const override {
    SimplifyThenLaunch(stream, data_type, num_dims, dst, dst_dims, dst_pos, src, src_dims, src_pos,
                       extent);
  }
};
class CopyNdFactoryImpl : public CopyNdFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(CopyNdFactoryImpl);
  CopyNdFactoryImpl() = default;
  ~CopyNdFactoryImpl() override = default;
  // Returns a CopyNd primitive, or nullptr when the requested rank exceeds
  // the implementation limit kMaxNumDims.
  std::unique_ptr<CopyNd> New(size_t max_num_dims) override {
    if (max_num_dims > kMaxNumDims) { return nullptr; }
    return std::unique_ptr<CopyNd>(new CopyNdImpl());
  }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, CopyNdFactory, CopyNdFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/include/primitive/copy_nd.h"
#include "oneflow/core/ep/common/primitive/copy_nd.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Grid-stride device kernel: copies an n-d sub-region of params.count elements,
// translating each copy coordinate by src_pos/dst_pos, moving movement_size
// bytes per element through an aligned opaque type.
template<size_t num_dims, size_t movement_size, typename IndexType>
__global__ void CopyNdKernel(CopyNdKernelParams<num_dims, IndexType> params) {
  using T = typename std::aligned_storage<movement_size, movement_size>::type;
  const T* src = reinterpret_cast<const T*>(params.src);
  T* dst = reinterpret_cast<T*>(params.dst);
  IndexType copy_index[num_dims];
  IndexType src_index[num_dims];
  IndexType dst_index[num_dims];
  CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) {
    params.copy_index_helper.OffsetToNdIndex(i, copy_index);
#pragma unroll
    for (size_t j = 0; j < num_dims; ++j) {
      // Offset the region-local coordinate into each tensor's coordinate space.
      src_index[j] = params.src_pos[j] + copy_index[j];
      dst_index[j] = params.dst_pos[j] + copy_index[j];
    }
    const IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index);
    const IndexType dst_offset = params.dst_index_helper.NdIndexToOffset(dst_index);
    dst[dst_offset] = src[src_offset];
  }
}
// Launches CopyNdKernel on the stream's HIP queue with the standard 1-D grid.
template<size_t num_dims, size_t movement_size, typename IndexType>
void LaunchKernel(Stream* stream, CopyNdKernelParams<num_dims, IndexType> params) {
  hipStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
  CopyNdKernel<num_dims, movement_size, IndexType>
      <<<BlocksNum4ThreadsNum(params.count), kCudaThreadsNumPerBlock, 0, cuda_stream>>>(params);
}
// CopyNd primitive: copies an n-d region between tensors of the same dtype.
class CopyNdImpl : public CopyNd {
 public:
  OF_DISALLOW_COPY_AND_MOVE(CopyNdImpl);
  CopyNdImpl() = default;
  ~CopyNdImpl() override = default;
  // Delegates to the shared SimplifyThenLaunch helper (common/copy_nd.h),
  // which reduces the copy description before launching.
  void Launch(Stream* stream, DataType data_type, size_t num_dims, void* dst,
              const int64_t* dst_dims, const int64_t* dst_pos, const void* src,
              const int64_t* src_dims, const int64_t* src_pos,
              const int64_t* extent) const override {
    SimplifyThenLaunch(stream, data_type, num_dims, dst, dst_dims, dst_pos, src, src_dims, src_pos,
                       extent);
  }
};
class CopyNdFactoryImpl : public CopyNdFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(CopyNdFactoryImpl);
  CopyNdFactoryImpl() = default;
  ~CopyNdFactoryImpl() override = default;
  // Returns a CopyNd primitive, or nullptr when the requested rank exceeds
  // the implementation limit kMaxNumDims.
  std::unique_ptr<CopyNd> New(size_t max_num_dims) override {
    if (max_num_dims > kMaxNumDims) { return nullptr; }
    return std::unique_ptr<CopyNd>(new CopyNdImpl());
  }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, CopyNdFactory, CopyNdFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/common/primitive/elementwise_unary.h"
#include "oneflow/core/ep/rocm/primitive/unary_functor.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Elementwise unary primitive: dst[i] = functor(src[i]) for i in [0, count).
// attr0/attr1 are op-specific attributes forwarded to the functor.
template<UnaryOp unary_op, typename Src, typename Dst>
class ElementwiseUnaryImpl : public ElementwiseUnary {
 public:
  OF_DISALLOW_COPY_AND_MOVE(ElementwiseUnaryImpl);
  ElementwiseUnaryImpl(Scalar attr0, Scalar attr1) : attr0(attr0), attr1(attr1) {}
  ~ElementwiseUnaryImpl() override = default;
  // Builds the functor from the stored attributes and runs the generic
  // elementwise launcher on the stream's queue.
  void Launch(Stream* stream, const void* src, void* dst, size_t count) override {
    auto* cuda_stream = stream->As<CudaStream>();
    auto functor = UnaryFunctor<DeviceType::kCUDA, unary_op, Dst, Src>(attr0, attr1);
    OF_CUDA_CHECK((cuda::elementwise::Unary<decltype(functor), Dst, Src>(
        functor, count, reinterpret_cast<Dst*>(dst), reinterpret_cast<const Src*>(src),
        cuda_stream->cuda_stream())));
  }
 protected:
  Scalar attr0, attr1;
};
// Factory helper: builds an ElementwiseUnary primitive for (op, Src, Dst).
template<UnaryOp unary_op, typename Src, typename Dst>
std::unique_ptr<ElementwiseUnary> NewElementwiseUnary(Scalar attr0, Scalar attr1) {
  return std::unique_ptr<ElementwiseUnary>(
      new ElementwiseUnaryImpl<unary_op, Src, Dst>(attr0, attr1));
}
class ElementwiseUnaryFactoryImpl : public ElementwiseUnaryFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(ElementwiseUnaryFactoryImpl);
  ElementwiseUnaryFactoryImpl() = default;
  ~ElementwiseUnaryFactoryImpl() override = default;
  // Convenience overload: no attributes.
  std::unique_ptr<ElementwiseUnary> New(UnaryOp unary_op, DataType src_type,
                                        DataType dst_dtype) override {
    return New(unary_op, src_type, dst_dtype, Scalar(), Scalar());
  }
  // Convenience overload: single attribute.
  std::unique_ptr<ElementwiseUnary> New(UnaryOp unary_op, DataType src_type, DataType dst_dtype,
                                        Scalar attr0) override {
    return New(unary_op, src_type, dst_dtype, attr0, Scalar());
  }
  // Looks up (op, src dtype, dst dtype) in a static dispatch table and builds
  // the primitive with the given attributes; returns nullptr when the
  // combination is unsupported.
  std::unique_ptr<ElementwiseUnary> New(UnaryOp unary_op, DataType src_type, DataType dst_dtype,
                                        Scalar attr0, Scalar attr1) override {
#define MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY(unary_op, dtype_pair)                  \
  {std::make_tuple(unary_op, OF_PP_PAIR_SECOND(dtype_pair), OF_PP_PAIR_SECOND(dtype_pair)), \
   NewElementwiseUnary<unary_op, OF_PP_PAIR_FIRST(dtype_pair), OF_PP_PAIR_FIRST(dtype_pair)>},
#define MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY(unary_op, src_type_pair, dst_dtype_pair) \
  {std::make_tuple(unary_op, OF_PP_PAIR_SECOND(src_type_pair), OF_PP_PAIR_SECOND(dst_dtype_pair)), \
   NewElementwiseUnary<unary_op, OF_PP_PAIR_FIRST(src_type_pair),                                 \
                       OF_PP_PAIR_FIRST(dst_dtype_pair)>},
    static const std::map<std::tuple<UnaryOp, DataType, DataType>,
                          std::function<std::unique_ptr<ElementwiseUnary>(Scalar, Scalar)>>
        new_elementwise_unary_handle{
            // For All Type OP
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY,
                                             UNARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ)
            // For Float Type OP
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY,
                                             UNARY_FLOATING_MATH_OP_SEQ,
                                             CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)
            // For Utils OP
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY,
                                             UNARY_UTILS_OP_SEQ, UTIL_OPS_DATA_TYPE_SEQ,
                                             CUDA_PRIMITIVE_BOOL_TYPE_SEQ)
            // For Logical OP
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY,
                                             UNARY_LOGICAL_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ,
                                             CUDA_PRIMITIVE_BOOL_TYPE_SEQ)};
#undef MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY
#undef MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY
    const auto it =
        new_elementwise_unary_handle.find(std::make_tuple(unary_op, src_type, dst_dtype));
    if (it != new_elementwise_unary_handle.end()) {
      return it->second(attr0, attr1);
    } else {
      return nullptr;
    }
  }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, ElementwiseUnaryFactory, ElementwiseUnaryFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/common/primitive/elementwise_unary.h"
#include "oneflow/core/ep/rocm/primitive/unary_functor.hip.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Elementwise unary primitive: dst[i] = functor(src[i]) for i in [0, count).
// attr0/attr1 are op-specific attributes forwarded to the functor.
template<UnaryOp unary_op, typename Src, typename Dst>
class ElementwiseUnaryImpl : public ElementwiseUnary {
 public:
  OF_DISALLOW_COPY_AND_MOVE(ElementwiseUnaryImpl);
  ElementwiseUnaryImpl(Scalar attr0, Scalar attr1) : attr0(attr0), attr1(attr1) {}
  ~ElementwiseUnaryImpl() override = default;
  // Builds the functor from the stored attributes and runs the generic
  // elementwise launcher on the stream's queue.
  void Launch(Stream* stream, const void* src, void* dst, size_t count) override {
    auto* cuda_stream = stream->As<CudaStream>();
    auto functor = UnaryFunctor<DeviceType::kCUDA, unary_op, Dst, Src>(attr0, attr1);
    OF_CUDA_CHECK((cuda::elementwise::Unary<decltype(functor), Dst, Src>(
        functor, count, reinterpret_cast<Dst*>(dst), reinterpret_cast<const Src*>(src),
        cuda_stream->cuda_stream())));
  }
 protected:
  Scalar attr0, attr1;
};
// Factory helper: builds an ElementwiseUnary primitive for (op, Src, Dst).
template<UnaryOp unary_op, typename Src, typename Dst>
std::unique_ptr<ElementwiseUnary> NewElementwiseUnary(Scalar attr0, Scalar attr1) {
  return std::unique_ptr<ElementwiseUnary>(
      new ElementwiseUnaryImpl<unary_op, Src, Dst>(attr0, attr1));
}
class ElementwiseUnaryFactoryImpl : public ElementwiseUnaryFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(ElementwiseUnaryFactoryImpl);
  ElementwiseUnaryFactoryImpl() = default;
  ~ElementwiseUnaryFactoryImpl() override = default;
  // Convenience overload: no attributes.
  std::unique_ptr<ElementwiseUnary> New(UnaryOp unary_op, DataType src_type,
                                        DataType dst_dtype) override {
    return New(unary_op, src_type, dst_dtype, Scalar(), Scalar());
  }
  // Convenience overload: single attribute.
  std::unique_ptr<ElementwiseUnary> New(UnaryOp unary_op, DataType src_type, DataType dst_dtype,
                                        Scalar attr0) override {
    return New(unary_op, src_type, dst_dtype, attr0, Scalar());
  }
  // Looks up (op, src dtype, dst dtype) in a static dispatch table and builds
  // the primitive with the given attributes; returns nullptr when the
  // combination is unsupported.
  std::unique_ptr<ElementwiseUnary> New(UnaryOp unary_op, DataType src_type, DataType dst_dtype,
                                        Scalar attr0, Scalar attr1) override {
#define MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY(unary_op, dtype_pair)                  \
  {std::make_tuple(unary_op, OF_PP_PAIR_SECOND(dtype_pair), OF_PP_PAIR_SECOND(dtype_pair)), \
   NewElementwiseUnary<unary_op, OF_PP_PAIR_FIRST(dtype_pair), OF_PP_PAIR_FIRST(dtype_pair)>},
#define MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY(unary_op, src_type_pair, dst_dtype_pair) \
  {std::make_tuple(unary_op, OF_PP_PAIR_SECOND(src_type_pair), OF_PP_PAIR_SECOND(dst_dtype_pair)), \
   NewElementwiseUnary<unary_op, OF_PP_PAIR_FIRST(src_type_pair),                                 \
                       OF_PP_PAIR_FIRST(dst_dtype_pair)>},
    static const std::map<std::tuple<UnaryOp, DataType, DataType>,
                          std::function<std::unique_ptr<ElementwiseUnary>(Scalar, Scalar)>>
        new_elementwise_unary_handle{
            // For All Type OP
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY,
                                             UNARY_MATH_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ)
            // For Float Type OP
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY,
                                             UNARY_FLOATING_MATH_OP_SEQ,
                                             CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)
            // For Utils OP
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY,
                                             UNARY_UTILS_OP_SEQ, UTIL_OPS_DATA_TYPE_SEQ,
                                             CUDA_PRIMITIVE_BOOL_TYPE_SEQ)
            // For Logical OP
            OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY,
                                             UNARY_LOGICAL_OP_SEQ, CUDA_PRIMITIVE_ALL_TYPE_SEQ,
                                             CUDA_PRIMITIVE_BOOL_TYPE_SEQ)};
#undef MAKE_NEW_DIFFERENT_DTYPE_ELEMENTWISE_UNARY_ENTRY
#undef MAKE_NEW_SAME_DTYPE_ELEMENTWISE_UNARY_ENTRY
    const auto it =
        new_elementwise_unary_handle.find(std::make_tuple(unary_op, src_type, dst_dtype));
    if (it != new_elementwise_unary_handle.end()) {
      return it->second(attr0, attr1);
    } else {
      return nullptr;
    }
  }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, ElementwiseUnaryFactory, ElementwiseUnaryFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
// NOTE: git "\ No newline at end of file" marker left behind by file concatenation; kept as a comment so the translation unit stays parseable.
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/include/primitive/fill.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Opaque blob with size == alignment == `size` bytes, used for widened stores.
template<size_t size>
using Storage = typename std::aligned_storage<size, size>::type;
// Broadcasts one T into `pack` lanes; `storage` views the lanes as a single
// aligned blob so one store writes the whole pack.
template<typename T, size_t pack>
union Pack {
  static constexpr size_t size = sizeof(T) * pack;
  explicit __device__ __host__ Pack(T value) {
    static_assert(sizeof(Pack) == size, "");
    static_assert(alignof(Pack) == size, "");
#pragma unroll
    for (size_t i = 0; i < pack; ++i) { elem[i] = value; }
  }
  T elem[pack];
  Storage<size> storage;
};
// Fills dst[0, count) with `value`: the bulk via pack-wide stores, then the
// remaining count % pack elements via scalar stores.
template<typename T, size_t pack>
__global__ void FillGpu(T* dst, T value, size_t count) {
  const size_t pack_count = count / pack;
  Pack<T, pack> pack_value(value);
  auto* pack_dst = reinterpret_cast<decltype(pack_value.storage)*>(dst);
  CUDA_1D_KERNEL_LOOP_T(size_t, i, pack_count) { pack_dst[i] = pack_value.storage; }
  T* tail_dst = dst + pack_count * pack;
  const size_t tail_count = count - pack_count * pack;
  CUDA_1D_KERNEL_LOOP_T(size_t, i, tail_count) { tail_dst[i] = value; }
}
// Extracts the scalar as T via Scalar::Value.
template<typename T>
T GetValue(Scalar value) {
  return value.Value<T>();
}
// half has no direct Scalar accessor; extract as float and narrow.
template<>
half GetValue<half>(Scalar value) {
  return static_cast<half>(GetValue<float>(value));
}
// #if CUDA_VERSION >= 11000
// template<>
// nv_bfloat16 GetValue<nv_bfloat16>(Scalar value) {
// return static_cast<nv_bfloat16>(GetValue<float>(value));
// }
// #endif // CUDA_VERSION >= 11000
// Normal path (pack > 0): launch the packed fill kernel.
template<typename T, size_t pack>
typename std::enable_if<(pack != 0), void>::type LaunchPackFill(hipStream_t stream, T* dst,
                                                                T value, size_t count) {
  FillGpu<T, pack>
      <<<BlocksNum4ThreadsNum(count), kCudaThreadsNumPerBlock, 0, stream>>>(dst, value, count);
}
// Selected when the compile-time pack expression divides down to 0 (alignment
// bucket smaller than sizeof(T)); treated as an unrecoverable alignment error.
template<typename T, size_t pack>
typename std::enable_if<(pack == 0), void>::type LaunchPackFill(hipStream_t stream, T* dst,
                                                                T value, size_t count) {
  LOG(FATAL) << "wrong alignment";
}
// Picks the widest store (up to 16 bytes) that the destination pointer's
// address alignment permits, then launches the matching packed fill.
template<typename T>
void LaunchFill(hipStream_t stream, T* dst, T value, size_t count) {
  auto uintptr = reinterpret_cast<std::uintptr_t>(dst);
  if (uintptr % 16 == 0) {
    LaunchPackFill<T, 16 / sizeof(T)>(stream, dst, value, count);
  } else if (uintptr % 8 == 0) {
    LaunchPackFill<T, 8 / sizeof(T)>(stream, dst, value, count);
  } else if (uintptr % 4 == 0) {
    LaunchPackFill<T, 4 / sizeof(T)>(stream, dst, value, count);
  } else if (uintptr % 2 == 0) {
    LaunchPackFill<T, 2 / sizeof(T)>(stream, dst, value, count);
  } else {
    // 1 / sizeof(T) is 0 for sizeof(T) > 1, which routes to the fatal overload.
    LaunchPackFill<T, 1 / sizeof(T)>(stream, dst, value, count);
  }
}
// Fill primitive for element type T: writes a constant into a buffer.
template<typename T>
class FillImpl : public Fill {
 public:
  OF_DISALLOW_COPY_AND_MOVE(FillImpl);
  FillImpl() = default;
  ~FillImpl() override = default;
  // Converts the Scalar to T and launches the alignment-aware fill.
  void Launch(Stream* stream, void* dst, Scalar value, size_t count) override {
    hipStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
    LaunchFill<T>(cuda_stream, reinterpret_cast<T*>(dst), GetValue<T>(value), count);
  }
};
// Factory helper: builds a Fill primitive for element type T.
template<typename T>
std::unique_ptr<Fill> NewFill() {
  return std::unique_ptr<Fill>(new FillImpl<T>());
}
class FillFactoryImpl : public FillFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(FillFactoryImpl);
  FillFactoryImpl() = default;
  ~FillFactoryImpl() override = default;
  // Returns a Fill primitive for data_type, or nullptr when the type is not in
  // CUDA_PRIMITIVE_ALL_TYPE_SEQ.
  std::unique_ptr<Fill> New(DataType data_type) override {
#define MAKE_NEW_FILL_ENTRY(type_cpp, type_proto) {type_proto, NewFill<type_cpp>},
    // Static dispatch table (DataType -> factory), built once on first call.
    static const std::map<DataType, std::function<std::unique_ptr<Fill>()>> new_fill_handle{
        OF_PP_FOR_EACH_TUPLE(MAKE_NEW_FILL_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)};
#undef MAKE_NEW_FILL_ENTRY
    const auto it = new_fill_handle.find(data_type);
    if (it != new_fill_handle.end()) {
      return it->second();
    } else {
      return nullptr;
    }
  }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, FillFactory, FillFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/include/primitive/fill.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Opaque blob with size == alignment == `size` bytes, used for widened stores.
template<size_t size>
using Storage = typename std::aligned_storage<size, size>::type;
// Broadcasts one value of T into `pack` contiguous elements so that the whole
// pack can be written with a single aligned `size`-byte store via `storage`.
template<typename T, size_t pack>
union Pack {
  static constexpr size_t size = sizeof(T) * pack;
  explicit __device__ __host__ Pack(T value) {
    // The union must be exactly one storage word, otherwise the vectorized
    // store in FillGpu would be mis-sized or mis-aligned.
    static_assert(sizeof(Pack) == size, "");
    static_assert(alignof(Pack) == size, "");
#pragma unroll
    for (size_t i = 0; i < pack; ++i) { elem[i] = value; }
  }
  T elem[pack];
  Storage<size> storage;
};
// Fill kernel: writes `value` into `count` elements at `dst`. The bulk of the
// buffer is written `pack` elements at a time as aligned storage words; the
// remaining (count % pack) tail elements are written one by one.
template<typename T, size_t pack>
__global__ void FillGpu(T* dst, T value, size_t count) {
  const size_t pack_count = count / pack;
  Pack<T, pack> pack_value(value);
  auto* pack_dst = reinterpret_cast<decltype(pack_value.storage)*>(dst);
  CUDA_1D_KERNEL_LOOP_T(size_t, i, pack_count) { pack_dst[i] = pack_value.storage; }
  // Elements past the last whole pack are stored individually.
  T* tail_dst = dst + pack_count * pack;
  const size_t tail_count = count - pack_count * pack;
  CUDA_1D_KERNEL_LOOP_T(size_t, i, tail_count) { tail_dst[i] = value; }
}
// Extracts the scalar's value as type T.
template<typename T>
T GetValue(Scalar value) {
  return value.Value<T>();
}
// half is produced by narrowing from the float value of the scalar.
template<>
half GetValue<half>(Scalar value) {
  return static_cast<half>(GetValue<float>(value));
}
// #if CUDA_VERSION >= 11000
// template<>
// nv_bfloat16 GetValue<nv_bfloat16>(Scalar value) {
// return static_cast<nv_bfloat16>(GetValue<float>(value));
// }
// #endif // CUDA_VERSION >= 11000
// Launches FillGpu with the given pack width; enabled only when pack > 0.
template<typename T, size_t pack>
typename std::enable_if<(pack != 0), void>::type LaunchPackFill(hipStream_t stream, T* dst,
                                                                T value, size_t count) {
  FillGpu<T, pack>
      <<<BlocksNum4ThreadsNum(count), kCudaThreadsNumPerBlock, 0, stream>>>(dst, value, count);
}
// Overload selected when pack == 0, i.e. the requested pack byte-width is
// smaller than sizeof(T): the pointer alignment cannot hold even one T.
template<typename T, size_t pack>
typename std::enable_if<(pack == 0), void>::type LaunchPackFill(hipStream_t stream, T* dst,
                                                                T value, size_t count) {
  LOG(FATAL) << "wrong alignment";
}
// Dispatches to the widest pack width (16/8/4/2/1 bytes) permitted by the
// destination pointer's alignment. For multi-byte T, an address that is not
// even 2-byte aligned yields pack == 0 and the fatal-error overload fires.
template<typename T>
void LaunchFill(hipStream_t stream, T* dst, T value, size_t count) {
  const auto addr = reinterpret_cast<std::uintptr_t>(dst);
  if (addr % 16 == 0) { return LaunchPackFill<T, 16 / sizeof(T)>(stream, dst, value, count); }
  if (addr % 8 == 0) { return LaunchPackFill<T, 8 / sizeof(T)>(stream, dst, value, count); }
  if (addr % 4 == 0) { return LaunchPackFill<T, 4 / sizeof(T)>(stream, dst, value, count); }
  if (addr % 2 == 0) { return LaunchPackFill<T, 2 / sizeof(T)>(stream, dst, value, count); }
  LaunchPackFill<T, 1 / sizeof(T)>(stream, dst, value, count);
}
// Fill primitive for element type T: converts the scalar once on the host and
// launches the packed fill kernel on the stream's HIP queue (asynchronous).
template<typename T>
class FillImpl : public Fill {
 public:
  OF_DISALLOW_COPY_AND_MOVE(FillImpl);
  FillImpl() = default;
  ~FillImpl() override = default;
  void Launch(Stream* stream, void* dst, Scalar value, size_t count) override {
    hipStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
    LaunchFill<T>(cuda_stream, reinterpret_cast<T*>(dst), GetValue<T>(value), count);
  }
};
template<typename T>
std::unique_ptr<Fill> NewFill() {
return std::unique_ptr<Fill>(new FillImpl<T>());
}
// Factory that creates a Fill primitive for a given DataType, or nullptr when
// the data type is not in the supported CUDA primitive type list.
class FillFactoryImpl : public FillFactory {
 public:
  OF_DISALLOW_COPY_AND_MOVE(FillFactoryImpl);
  FillFactoryImpl() = default;
  ~FillFactoryImpl() override = default;
  std::unique_ptr<Fill> New(DataType data_type) override {
#define MAKE_NEW_FILL_ENTRY(type_cpp, type_proto) {type_proto, NewFill<type_cpp>},
    // One creator function per supported element type, keyed by DataType.
    static const std::map<DataType, std::function<std::unique_ptr<Fill>()>> new_fill_handle{
        OF_PP_FOR_EACH_TUPLE(MAKE_NEW_FILL_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)};
#undef MAKE_NEW_FILL_ENTRY
    const auto it = new_fill_handle.find(data_type);
    if (it == new_fill_handle.end()) { return nullptr; }
    return it->second();
  }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, FillFactory, FillFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_ROCM
#include "oneflow/core/ep/include/primitive/memcpy.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Memcpy primitive backed by hipMemcpyAsync with hipMemcpyDefault, which lets
// the runtime infer the copy direction from the pointer kinds.
class MemcpyImpl : public Memcpy {
 public:
  OF_DISALLOW_COPY_AND_MOVE(MemcpyImpl);
  MemcpyImpl() = default;
  ~MemcpyImpl() override = default;
  // Asynchronously copies `count` bytes from `src` to `dst` on the stream.
  void Launch(Stream* stream, void* dst, const void* src, size_t count) override {
    // Copying a buffer onto itself is a no-op; skip the runtime call entirely.
    if (dst != src) {
      auto* cuda_stream = stream->As<CudaStream>();
      OF_CUDA_CHECK(hipMemcpyAsync(dst, src, count, hipMemcpyDefault, cuda_stream->cuda_stream()));
    }
  }
};
class MemcpyFactoryImpl : public MemcpyFactory {
public:
OF_DISALLOW_COPY_AND_MOVE(MemcpyFactoryImpl);
MemcpyFactoryImpl() = default;
~MemcpyFactoryImpl() override = default;
std::unique_ptr<Memcpy> New(MemcpyKind kind) override {
return std::unique_ptr<Memcpy>(new MemcpyImpl());
}
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemcpyFactory, MemcpyFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_ROCM
#include "oneflow/core/ep/include/primitive/memcpy.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Memcpy primitive backed by hipMemcpyAsync with hipMemcpyDefault, which lets
// the runtime infer the copy direction from the pointer kinds.
class MemcpyImpl : public Memcpy {
 public:
  OF_DISALLOW_COPY_AND_MOVE(MemcpyImpl);
  MemcpyImpl() = default;
  ~MemcpyImpl() override = default;
  // Asynchronously copies `count` bytes from `src` to `dst` on the stream.
  void Launch(Stream* stream, void* dst, const void* src, size_t count) override {
    // Copying a buffer onto itself is a no-op; skip the runtime call entirely.
    if (dst != src) {
      auto* cuda_stream = stream->As<CudaStream>();
      OF_CUDA_CHECK(hipMemcpyAsync(dst, src, count, hipMemcpyDefault, cuda_stream->cuda_stream()));
    }
  }
};
class MemcpyFactoryImpl : public MemcpyFactory {
public:
OF_DISALLOW_COPY_AND_MOVE(MemcpyFactoryImpl);
MemcpyFactoryImpl() = default;
~MemcpyFactoryImpl() override = default;
std::unique_ptr<Memcpy> New(MemcpyKind kind) override {
return std::unique_ptr<Memcpy>(new MemcpyImpl());
}
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemcpyFactory, MemcpyFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_ROCM
#include "oneflow/core/ep/include/primitive/memset.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
class MemsetImpl : public Memset {
public:
OF_DISALLOW_COPY_AND_MOVE(MemsetImpl);
MemsetImpl() = default;
~MemsetImpl() override = default;
void Launch(Stream* stream, void* ptr, int value, size_t count) override {
auto* cuda_stream = stream->As<CudaStream>();
OF_CUDA_CHECK(hipMemsetAsync(ptr, value, count, cuda_stream->cuda_stream()));
}
};
class MemsetFactoryImpl : public MemsetFactory {
public:
OF_DISALLOW_COPY_AND_MOVE(MemsetFactoryImpl);
MemsetFactoryImpl() = default;
~MemsetFactoryImpl() override = default;
std::unique_ptr<Memset> New() override { return std::unique_ptr<Memset>(new MemsetImpl()); }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemsetFactory, MemsetFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_ROCM
#include "oneflow/core/ep/include/primitive/memset.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
class MemsetImpl : public Memset {
public:
OF_DISALLOW_COPY_AND_MOVE(MemsetImpl);
MemsetImpl() = default;
~MemsetImpl() override = default;
void Launch(Stream* stream, void* ptr, int value, size_t count) override {
auto* cuda_stream = stream->As<CudaStream>();
OF_CUDA_CHECK(hipMemsetAsync(ptr, value, count, cuda_stream->cuda_stream()));
}
};
class MemsetFactoryImpl : public MemsetFactory {
public:
OF_DISALLOW_COPY_AND_MOVE(MemsetFactoryImpl);
MemsetFactoryImpl() = default;
~MemsetFactoryImpl() override = default;
std::unique_ptr<Memset> New() override { return std::unique_ptr<Memset>(new MemsetImpl()); }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemsetFactory, MemsetFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/include/primitive/permute.h"
#include "oneflow/core/ep/common/primitive/permute_impl.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace permute {
namespace internal {
namespace {
constexpr int32_t kMov4TileSize = 32;  // tile edge (in elements) for 4-byte moves
constexpr int32_t kMov2TileSize = 64;  // tile edge (in elements) for 2-byte moves
constexpr int32_t kBlockRows = 8;      // tile rows advanced per loop step (threadIdx.y stride)
// Generic permute fallback: for each destination offset, reconstruct the
// destination nd-index, scatter it through `params.permutation` to form the
// source nd-index, and copy one `movement_size`-byte element.
template<size_t num_dims, size_t movement_size, typename IndexType>
__global__ void PermuteKernel(PermuteKernelParams<num_dims, IndexType> params) {
  using T = typename std::aligned_storage<movement_size, movement_size>::type;
  const T* src = reinterpret_cast<const T*>(params.src);
  T* dst = reinterpret_cast<T*>(params.dst);
  IndexType src_index[num_dims];
  IndexType dst_index[num_dims];
  CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) {
    params.dst_index_helper.OffsetToNdIndex(i, dst_index);
#pragma unroll
    for (size_t dim = 0; dim < num_dims; ++dim) {
      // Destination dim `dim` was taken from source dim `permutation[dim]`.
      src_index[params.permutation[dim]] = dst_index[dim];
    }
    IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index);
    dst[i] = src[src_offset];
  }
}
// (B, X, Y) -> (B, Y, X)
// refer from https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/
// Tiled batch transpose: treats the input as (batch, rows, cols) and writes
// (batch, cols, rows). Each block loops over tiles; a tile is staged in shared
// memory (padded with one extra column to avoid bank conflicts) so that both
// the global-memory read and the transposed write stay coalesced.
template<size_t num_dims, size_t movement_size, size_t tile_size, typename IndexType>
__global__ void BatchTransposeKernel(const void* src_ptr, void* dst_ptr, IndexType rows,
                                     IndexType cols, IndexType num_tile_rows,
                                     IndexType num_tile_cols, int32_t block_nums) {
  const IndexType src_rows = rows;
  const IndexType src_cols = cols;
  const IndexType dst_rows = cols;
  const IndexType dst_cols = rows;
  using T = typename std::aligned_storage<movement_size, movement_size>::type;
  __shared__ T tile[tile_size][tile_size + 1];  // To avoid bank conflict.
  const T* src = reinterpret_cast<const T*>(src_ptr);
  T* dst = reinterpret_cast<T*>(dst_ptr);
  IndexType batch_num_tile = num_tile_rows * num_tile_cols;
  for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) {
    const IndexType batch_index = i / batch_num_tile;  // the index of batch.
    const IndexType tile_index =
        i - batch_index * batch_num_tile;  // equal to i % (num_tile_rows*num_tile_cols). the
                                           // flattened index of the tile in a batch.
    const IndexType tile_row_index =
        tile_index / num_tile_cols;  // the row index of the tile in a batch.
    const IndexType tile_col_index =
        tile_index
        - tile_row_index
              * num_tile_cols;  // equal to tile_index % num_tile_cols. the col index of the tile.
    const IndexType offset = batch_index * src_rows * src_cols;
    // Phase 1: load one tile from the source matrix into shared memory.
    {
      IndexType col_in_tile = threadIdx.x;
      IndexType col_in_matrix = tile_col_index * tile_size + threadIdx.x;
#pragma unroll
      for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size;
           row_in_tile += kBlockRows) {
        IndexType row_in_matrix = row_in_tile + tile_row_index * tile_size;
        if (col_in_matrix < src_cols && row_in_matrix < src_rows) {
          tile[row_in_tile][col_in_tile] = src[offset + row_in_matrix * src_cols + col_in_matrix];
        }
      }
    }
    __syncthreads();
    // Phase 2: write the tile back transposed (note the swapped tile indices).
    {
      IndexType col_in_tile = threadIdx.x;
      IndexType col_in_matrix = tile_row_index * tile_size + threadIdx.x;
#pragma unroll
      for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size;
           row_in_tile += kBlockRows) {
        IndexType row_in_matrix = row_in_tile + tile_col_index * tile_size;
        if (col_in_matrix < dst_cols && row_in_matrix < dst_rows) {
          dst[offset + row_in_matrix * dst_cols + col_in_matrix] = tile[col_in_tile][row_in_tile];
        }
      }
    }
    // Guard the shared tile before the next loop iteration overwrites it.
    __syncthreads();
  }
}
/*
Here is a movement_size=2 version of the batch transpose.
When H and W are both divisible by 2, we can read and write the data with
movement_size=4 accesses instead.
*/
// 2-byte-element tiled batch transpose. Each thread loads/stores two 2-byte
// elements at once as a single 4-byte word; the shared-memory union lets the
// tile be filled with 4-byte words but read back transposed as 2-byte elements.
template<size_t num_dims, size_t tile_size, typename IndexType>
__global__ void BatchTransposeMovement2Kernel(const void* src_ptr, void* dst_ptr, IndexType rows,
                                              IndexType cols, IndexType num_tile_rows,
                                              IndexType num_tile_cols, int32_t block_nums) {
  const IndexType src_rows = rows;
  const IndexType src_cols = cols;
  const IndexType dst_rows = cols;
  const IndexType dst_cols = rows;
  static_assert(tile_size % 2 == 0, "");
  using T_MOV2 = typename std::aligned_storage<2, 2>::type;
  using T_MOV4 = typename std::aligned_storage<4, 4>::type;
  const T_MOV4* src = reinterpret_cast<const T_MOV4*>(src_ptr);
  T_MOV4* dst = reinterpret_cast<T_MOV4*>(dst_ptr);
  // Use a union so the same shared tile can be accessed for both Load and Store.
  __shared__ union {
    T_MOV2 tile_m2[tile_size][tile_size + 2];     // half [64][66]
    T_MOV4 tile_m4[tile_size][tile_size / 2 + 1];  // half2 [64][33]
  } tile_mem;
  IndexType batch_num_tile = num_tile_rows * num_tile_cols;
  for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) {
    const IndexType batch_index = i / batch_num_tile;  // the index of batch.
    const IndexType tile_index =
        i - batch_index * batch_num_tile;  // equal to i % (num_tile_rows*num_tile_cols). the
                                           // flattened index of the tile in a batch.
    const IndexType tile_row_index =
        tile_index / num_tile_cols;  // the row index of the tile in a batch.
    const IndexType tile_col_index =
        tile_index
        - tile_row_index
              * num_tile_cols;  // equal to tile_index % num_tile_cols. the col index of the tile.
    const IndexType offset = batch_index * src_rows * src_cols;
    // Phase 1: load the tile, two 2-byte elements (one 4-byte word) per access.
    {
      IndexType col_in_tile = threadIdx.x;
      IndexType col_in_matrix = tile_col_index * tile_size + threadIdx.x * 2;
#pragma unroll
      for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size;
           row_in_tile += kBlockRows) {
        IndexType row_in_matrix = row_in_tile + tile_row_index * tile_size;
        if (col_in_matrix < src_cols && row_in_matrix < src_rows) {
          tile_mem.tile_m4[row_in_tile][col_in_tile] =
              src[(offset + row_in_matrix * src_cols + col_in_matrix) / 2];
        }
      }
    }
    __syncthreads();
    // Phase 2: gather two transposed 2-byte elements and store one 4-byte word.
    {
      IndexType col_in_tile = threadIdx.x;
      IndexType col_in_matrix = tile_row_index * tile_size + threadIdx.x * 2;
#pragma unroll
      for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size;
           row_in_tile += kBlockRows) {
        IndexType row_in_matrix = row_in_tile + tile_col_index * tile_size;
        union {
          T_MOV4 m4;
          T_MOV2 m2[2];
        } tmp_storage;
        if (col_in_matrix < dst_cols && row_in_matrix < dst_rows) {
          tmp_storage.m2[0] = tile_mem.tile_m2[col_in_tile * 2][row_in_tile];
          tmp_storage.m2[1] = tile_mem.tile_m2[col_in_tile * 2 + 1][row_in_tile];
          dst[(offset + row_in_matrix * dst_cols + col_in_matrix) / 2] = tmp_storage.m4;
        }
      }
    }
    // Guard the shared tile before the next loop iteration overwrites it.
    __syncthreads();
  }
}
// Computes the tile grid and launches the appropriate tiled transpose kernel.
// The grid is capped at kCudaMaxBlocksNum; the kernels loop over any remaining
// tiles (`block_nums` is the total tile count).
template<size_t num_dims, size_t movement_size, size_t tile_size, typename IndexType>
void LaunchBatchTransposeKernel(hipStream_t& cuda_stream,
                                const PermuteKernelParams<num_dims, IndexType>& params,
                                const IndexType& num_batches, const IndexType& rows,
                                const IndexType& cols) {
  IndexType num_tile_rows = (rows + tile_size - 1) / tile_size;
  IndexType num_tile_cols = (cols + tile_size - 1) / tile_size;
  const int32_t block_nums = num_batches * num_tile_rows * num_tile_cols;
  int32_t launched_block_nums = std::min(block_nums, kCudaMaxBlocksNum);
  if (tile_size == kMov2TileSize) {
    const int32_t half2_thread = tile_size / 2;  // because each thread processes two half elements.
    BatchTransposeMovement2Kernel<num_dims, kMov2TileSize, IndexType>
        <<<launched_block_nums, dim3(half2_thread, kBlockRows), 0, cuda_stream>>>(
            params.src, params.dst, rows, cols, num_tile_rows, num_tile_cols,
            block_nums);  // Set the thread shape to 32x8, because each thread
                          // processes 4 elements of the 64x66 half shared-memory tile.
  } else {
    BatchTransposeKernel<num_dims, movement_size, tile_size, IndexType>
        <<<launched_block_nums, dim3(tile_size, kBlockRows), 0, cuda_stream>>>(
            params.src, params.dst, rows, cols, num_tile_rows, num_tile_cols, block_nums);
  }
}
// Returns true iff both matrix extents can cover at least one full tile of
// `tile_size` in each direction.
template<size_t tile_size, typename IndexType>
bool CheckIfGreaterEqualThanTileSize(const IndexType& rows, const IndexType& cols) {
  return rows >= tile_size && cols >= tile_size;
}
// Decides whether the batch-transpose fast path applies: the extents must reach
// one full tile, and the (already simplified) permutation must be a pure
// last-two-dims swap — 2d (0, 1) -> (1, 0) or 3d (0, 1, 2) -> (0, 2, 1).
template<size_t num_dims, size_t tile_size, typename IndexType>
bool CheckLaunchBatchTranspose(const int* permutation, const IndexType& num_batches,
                               const IndexType& rows, const IndexType& cols) {
  if (!CheckIfGreaterEqualThanTileSize<tile_size, IndexType>(rows, cols)) { return false; }
  // 2d tensor case: (0, 1) -> (1, 0)
  if (num_batches == 1 && permutation[1] == 0 && permutation[0] == 1) { return true; }
  // 3d tensor case: (0, 1, 2) -> (0, 2, 1)
  if (num_dims == 3 && permutation[2] == 1 && permutation[1] == 2) { return true; }
  return false;
}
// The movement_size=2 fast path is usable only for 2-byte elements when both
// extents are even and both pointers are at least 4-byte aligned, so every
// access can move two elements (4 bytes) at a time.
template<typename IndexType, size_t movement_size>
bool CheckUseMov2(const IndexType& rows, const IndexType& cols, const void* src, void* dst) {
  if (movement_size != 2) { return false; }
  if (rows % 2 != 0 || cols % 2 != 0) { return false; }
  const auto src_addr = reinterpret_cast<std::uintptr_t>(src);
  const auto dst_addr = reinterpret_cast<std::uintptr_t>(dst);
  return (src_addr % 4 == 0) && (dst_addr % 4 == 0);
}
// Maps src_dims onto (num_batches, rows, cols): a 2d tensor is a single batch
// of (d0, d1); a 3d tensor is d0 batches of (d1, d2).
template<size_t num_dims, typename IndexType>
void InferBatchTransposeShape(const int64_t* src_dims, IndexType* num_batches, IndexType* rows,
                              IndexType* cols) {
  const bool has_batch_dim = (num_dims != 2);
  *num_batches = has_batch_dim ? src_dims[0] : 1;
  *rows = src_dims[has_batch_dim ? 1 : 0];
  *cols = src_dims[has_batch_dim ? 2 : 1];
}
// Dispatch per (num_dims, movement_size, IndexType) instantiation: tries the
// tiled batch-transpose fast path for 2d/3d last-two-dims swaps, otherwise
// falls back to the generic PermuteKernel.
template<size_t num_dims, size_t movement_size, typename IndexType>
void LaunchKernel(Stream* stream, const int64_t* src_dims, const void* src, const int* permutation,
                  void* dst, size_t count) {
  PermuteKernelParams<num_dims, IndexType> params =
      MakePermuteParams<num_dims, IndexType>(src_dims, src, permutation, dst, count);
  hipStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
  if (num_dims == 2 || num_dims == 3) {
    IndexType num_batches;
    IndexType rows;
    IndexType cols;
    InferBatchTransposeShape<num_dims, IndexType>(src_dims, &num_batches, &rows, &cols);
    if (CheckLaunchBatchTranspose<num_dims, kMov4TileSize>(params.permutation, num_batches, rows,
                                                           cols)) {
      // Prefer the movement_size=2 variant when alignment and parity allow it.
      if (CheckUseMov2<IndexType, movement_size>(rows, cols, src, dst)) {
        LaunchBatchTransposeKernel<num_dims, 2, kMov2TileSize, IndexType>(cuda_stream, params,
                                                                          num_batches, rows, cols);
      } else {
        LaunchBatchTransposeKernel<num_dims, movement_size, kMov4TileSize, IndexType>(
            cuda_stream, params, num_batches, rows, cols);
      }
    } else {
      PermuteKernel<num_dims, movement_size, IndexType>
          <<<BlocksNum4ThreadsNum(params.count), kCudaThreadsNumPerBlock, 0, cuda_stream>>>(params);
    }
  } else {
    PermuteKernel<num_dims, movement_size, IndexType>
        <<<BlocksNum4ThreadsNum(params.count), kCudaThreadsNumPerBlock, 0, cuda_stream>>>(params);
  }
}
// Permute primitive: forwards to the shared SimplifyThenLaunch helper from
// permute_impl.h, which dispatches into the kernels above.
class PermuteImpl : public Permute {
 public:
  OF_DISALLOW_COPY_AND_MOVE(PermuteImpl);
  PermuteImpl() = default;
  ~PermuteImpl() override = default;
  using Permute::Launch;
  void Launch(Stream* stream, DataType data_type, size_t num_dims, const int64_t* src_dims,
              const void* src, const int* permutation, void* dst) override {
    SimplifyThenLaunch(stream, data_type, num_dims, src_dims, src, permutation, dst);
  }
};
class PermuteFactoryImpl : public PermuteFactory {
public:
OF_DISALLOW_COPY_AND_MOVE(PermuteFactoryImpl);
PermuteFactoryImpl() = default;
~PermuteFactoryImpl() override = default;
std::unique_ptr<Permute> New(size_t max_num_dims) override {
if (max_num_dims <= kMaxNumDims) {
return std::unique_ptr<Permute>(new PermuteImpl());
} else {
return nullptr;
}
}
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, PermuteFactory, PermuteFactoryImpl);
} // namespace
} // namespace internal
} // namespace permute
} // namespace primitive
} // namespace ep
} // namespace oneflow
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/include/primitive/permute.h"
#include "oneflow/core/ep/common/primitive/permute_impl.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace permute {
namespace internal {
namespace {
constexpr int32_t kMov4TileSize = 32;  // tile edge (in elements) for 4-byte moves
constexpr int32_t kMov2TileSize = 64;  // tile edge (in elements) for 2-byte moves
constexpr int32_t kBlockRows = 8;      // tile rows advanced per loop step (threadIdx.y stride)
// Generic permute fallback: for each destination offset, reconstruct the
// destination nd-index, scatter it through `params.permutation` to form the
// source nd-index, and copy one `movement_size`-byte element.
template<size_t num_dims, size_t movement_size, typename IndexType>
__global__ void PermuteKernel(PermuteKernelParams<num_dims, IndexType> params) {
  using T = typename std::aligned_storage<movement_size, movement_size>::type;
  const T* src = reinterpret_cast<const T*>(params.src);
  T* dst = reinterpret_cast<T*>(params.dst);
  IndexType src_index[num_dims];
  IndexType dst_index[num_dims];
  CUDA_1D_KERNEL_LOOP_T(IndexType, i, params.count) {
    params.dst_index_helper.OffsetToNdIndex(i, dst_index);
#pragma unroll
    for (size_t dim = 0; dim < num_dims; ++dim) {
      // Destination dim `dim` was taken from source dim `permutation[dim]`.
      src_index[params.permutation[dim]] = dst_index[dim];
    }
    IndexType src_offset = params.src_index_helper.NdIndexToOffset(src_index);
    dst[i] = src[src_offset];
  }
}
// (B, X, Y) -> (B, Y, X)
// refer from https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/
// Tiled batch transpose: treats the input as (batch, rows, cols) and writes
// (batch, cols, rows). Each block loops over tiles; a tile is staged in shared
// memory (padded with one extra column to avoid bank conflicts) so that both
// the global-memory read and the transposed write stay coalesced.
template<size_t num_dims, size_t movement_size, size_t tile_size, typename IndexType>
__global__ void BatchTransposeKernel(const void* src_ptr, void* dst_ptr, IndexType rows,
                                     IndexType cols, IndexType num_tile_rows,
                                     IndexType num_tile_cols, int32_t block_nums) {
  const IndexType src_rows = rows;
  const IndexType src_cols = cols;
  const IndexType dst_rows = cols;
  const IndexType dst_cols = rows;
  using T = typename std::aligned_storage<movement_size, movement_size>::type;
  __shared__ T tile[tile_size][tile_size + 1];  // To avoid bank conflict.
  const T* src = reinterpret_cast<const T*>(src_ptr);
  T* dst = reinterpret_cast<T*>(dst_ptr);
  IndexType batch_num_tile = num_tile_rows * num_tile_cols;
  for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) {
    const IndexType batch_index = i / batch_num_tile;  // the index of batch.
    const IndexType tile_index =
        i - batch_index * batch_num_tile;  // equal to i % (num_tile_rows*num_tile_cols). the
                                           // flattened index of the tile in a batch.
    const IndexType tile_row_index =
        tile_index / num_tile_cols;  // the row index of the tile in a batch.
    const IndexType tile_col_index =
        tile_index
        - tile_row_index
              * num_tile_cols;  // equal to tile_index % num_tile_cols. the col index of the tile.
    const IndexType offset = batch_index * src_rows * src_cols;
    // Phase 1: load one tile from the source matrix into shared memory.
    {
      IndexType col_in_tile = threadIdx.x;
      IndexType col_in_matrix = tile_col_index * tile_size + threadIdx.x;
#pragma unroll
      for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size;
           row_in_tile += kBlockRows) {
        IndexType row_in_matrix = row_in_tile + tile_row_index * tile_size;
        if (col_in_matrix < src_cols && row_in_matrix < src_rows) {
          tile[row_in_tile][col_in_tile] = src[offset + row_in_matrix * src_cols + col_in_matrix];
        }
      }
    }
    __syncthreads();
    // Phase 2: write the tile back transposed (note the swapped tile indices).
    {
      IndexType col_in_tile = threadIdx.x;
      IndexType col_in_matrix = tile_row_index * tile_size + threadIdx.x;
#pragma unroll
      for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size;
           row_in_tile += kBlockRows) {
        IndexType row_in_matrix = row_in_tile + tile_col_index * tile_size;
        if (col_in_matrix < dst_cols && row_in_matrix < dst_rows) {
          dst[offset + row_in_matrix * dst_cols + col_in_matrix] = tile[col_in_tile][row_in_tile];
        }
      }
    }
    // Guard the shared tile before the next loop iteration overwrites it.
    __syncthreads();
  }
}
/*
Here is a movement_size=2 version of the batch transpose.
When H and W are both divisible by 2, we can read and write the data with
movement_size=4 accesses instead.
*/
// 2-byte-element tiled batch transpose. Each thread loads/stores two 2-byte
// elements at once as a single 4-byte word; the shared-memory union lets the
// tile be filled with 4-byte words but read back transposed as 2-byte elements.
template<size_t num_dims, size_t tile_size, typename IndexType>
__global__ void BatchTransposeMovement2Kernel(const void* src_ptr, void* dst_ptr, IndexType rows,
                                              IndexType cols, IndexType num_tile_rows,
                                              IndexType num_tile_cols, int32_t block_nums) {
  const IndexType src_rows = rows;
  const IndexType src_cols = cols;
  const IndexType dst_rows = cols;
  const IndexType dst_cols = rows;
  static_assert(tile_size % 2 == 0, "");
  using T_MOV2 = typename std::aligned_storage<2, 2>::type;
  using T_MOV4 = typename std::aligned_storage<4, 4>::type;
  const T_MOV4* src = reinterpret_cast<const T_MOV4*>(src_ptr);
  T_MOV4* dst = reinterpret_cast<T_MOV4*>(dst_ptr);
  // Use a union so the same shared tile can be accessed for both Load and Store.
  __shared__ union {
    T_MOV2 tile_m2[tile_size][tile_size + 2];     // half [64][66]
    T_MOV4 tile_m4[tile_size][tile_size / 2 + 1];  // half2 [64][33]
  } tile_mem;
  IndexType batch_num_tile = num_tile_rows * num_tile_cols;
  for (int i = blockIdx.x, step = gridDim.x; i < block_nums; i += step) {
    const IndexType batch_index = i / batch_num_tile;  // the index of batch.
    const IndexType tile_index =
        i - batch_index * batch_num_tile;  // equal to i % (num_tile_rows*num_tile_cols). the
                                           // flattened index of the tile in a batch.
    const IndexType tile_row_index =
        tile_index / num_tile_cols;  // the row index of the tile in a batch.
    const IndexType tile_col_index =
        tile_index
        - tile_row_index
              * num_tile_cols;  // equal to tile_index % num_tile_cols. the col index of the tile.
    const IndexType offset = batch_index * src_rows * src_cols;
    // Phase 1: load the tile, two 2-byte elements (one 4-byte word) per access.
    {
      IndexType col_in_tile = threadIdx.x;
      IndexType col_in_matrix = tile_col_index * tile_size + threadIdx.x * 2;
#pragma unroll
      for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size;
           row_in_tile += kBlockRows) {
        IndexType row_in_matrix = row_in_tile + tile_row_index * tile_size;
        if (col_in_matrix < src_cols && row_in_matrix < src_rows) {
          tile_mem.tile_m4[row_in_tile][col_in_tile] =
              src[(offset + row_in_matrix * src_cols + col_in_matrix) / 2];
        }
      }
    }
    __syncthreads();
    // Phase 2: gather two transposed 2-byte elements and store one 4-byte word.
    {
      IndexType col_in_tile = threadIdx.x;
      IndexType col_in_matrix = tile_row_index * tile_size + threadIdx.x * 2;
#pragma unroll
      for (IndexType row_in_tile = threadIdx.y; row_in_tile < tile_size;
           row_in_tile += kBlockRows) {
        IndexType row_in_matrix = row_in_tile + tile_col_index * tile_size;
        union {
          T_MOV4 m4;
          T_MOV2 m2[2];
        } tmp_storage;
        if (col_in_matrix < dst_cols && row_in_matrix < dst_rows) {
          tmp_storage.m2[0] = tile_mem.tile_m2[col_in_tile * 2][row_in_tile];
          tmp_storage.m2[1] = tile_mem.tile_m2[col_in_tile * 2 + 1][row_in_tile];
          dst[(offset + row_in_matrix * dst_cols + col_in_matrix) / 2] = tmp_storage.m4;
        }
      }
    }
    // Guard the shared tile before the next loop iteration overwrites it.
    __syncthreads();
  }
}
// Computes the tile grid and launches the appropriate tiled transpose kernel.
// The grid is capped at kCudaMaxBlocksNum; the kernels loop over any remaining
// tiles (`block_nums` is the total tile count).
template<size_t num_dims, size_t movement_size, size_t tile_size, typename IndexType>
void LaunchBatchTransposeKernel(hipStream_t& cuda_stream,
                                const PermuteKernelParams<num_dims, IndexType>& params,
                                const IndexType& num_batches, const IndexType& rows,
                                const IndexType& cols) {
  IndexType num_tile_rows = (rows + tile_size - 1) / tile_size;
  IndexType num_tile_cols = (cols + tile_size - 1) / tile_size;
  const int32_t block_nums = num_batches * num_tile_rows * num_tile_cols;
  int32_t launched_block_nums = std::min(block_nums, kCudaMaxBlocksNum);
  if (tile_size == kMov2TileSize) {
    const int32_t half2_thread = tile_size / 2;  // because each thread processes two half elements.
    BatchTransposeMovement2Kernel<num_dims, kMov2TileSize, IndexType>
        <<<launched_block_nums, dim3(half2_thread, kBlockRows), 0, cuda_stream>>>(
            params.src, params.dst, rows, cols, num_tile_rows, num_tile_cols,
            block_nums);  // Set the thread shape to 32x8, because each thread
                          // processes 4 elements of the 64x66 half shared-memory tile.
  } else {
    BatchTransposeKernel<num_dims, movement_size, tile_size, IndexType>
        <<<launched_block_nums, dim3(tile_size, kBlockRows), 0, cuda_stream>>>(
            params.src, params.dst, rows, cols, num_tile_rows, num_tile_cols, block_nums);
  }
}
// Returns true iff both matrix extents can cover at least one full tile of
// `tile_size` in each direction.
template<size_t tile_size, typename IndexType>
bool CheckIfGreaterEqualThanTileSize(const IndexType& rows, const IndexType& cols) {
  return rows >= tile_size && cols >= tile_size;
}
// Decides whether the batch-transpose fast path applies: the extents must reach
// one full tile, and the (already simplified) permutation must be a pure
// last-two-dims swap — 2d (0, 1) -> (1, 0) or 3d (0, 1, 2) -> (0, 2, 1).
template<size_t num_dims, size_t tile_size, typename IndexType>
bool CheckLaunchBatchTranspose(const int* permutation, const IndexType& num_batches,
                               const IndexType& rows, const IndexType& cols) {
  if (!CheckIfGreaterEqualThanTileSize<tile_size, IndexType>(rows, cols)) { return false; }
  // 2d tensor case: (0, 1) -> (1, 0)
  if (num_batches == 1 && permutation[1] == 0 && permutation[0] == 1) { return true; }
  // 3d tensor case: (0, 1, 2) -> (0, 2, 1)
  if (num_dims == 3 && permutation[2] == 1 && permutation[1] == 2) { return true; }
  return false;
}
// The movement_size=2 fast path is usable only for 2-byte elements when both
// extents are even and both pointers are at least 4-byte aligned, so every
// access can move two elements (4 bytes) at a time.
template<typename IndexType, size_t movement_size>
bool CheckUseMov2(const IndexType& rows, const IndexType& cols, const void* src, void* dst) {
  if (movement_size != 2) { return false; }
  if (rows % 2 != 0 || cols % 2 != 0) { return false; }
  const auto src_addr = reinterpret_cast<std::uintptr_t>(src);
  const auto dst_addr = reinterpret_cast<std::uintptr_t>(dst);
  return (src_addr % 4 == 0) && (dst_addr % 4 == 0);
}
// Maps src_dims onto (num_batches, rows, cols): a 2d tensor is a single batch
// of (d0, d1); a 3d tensor is d0 batches of (d1, d2).
template<size_t num_dims, typename IndexType>
void InferBatchTransposeShape(const int64_t* src_dims, IndexType* num_batches, IndexType* rows,
                              IndexType* cols) {
  const bool has_batch_dim = (num_dims != 2);
  *num_batches = has_batch_dim ? src_dims[0] : 1;
  *rows = src_dims[has_batch_dim ? 1 : 0];
  *cols = src_dims[has_batch_dim ? 2 : 1];
}
// Dispatch per (num_dims, movement_size, IndexType) instantiation: tries the
// tiled batch-transpose fast path for 2d/3d last-two-dims swaps, otherwise
// falls back to the generic PermuteKernel.
template<size_t num_dims, size_t movement_size, typename IndexType>
void LaunchKernel(Stream* stream, const int64_t* src_dims, const void* src, const int* permutation,
                  void* dst, size_t count) {
  PermuteKernelParams<num_dims, IndexType> params =
      MakePermuteParams<num_dims, IndexType>(src_dims, src, permutation, dst, count);
  hipStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
  if (num_dims == 2 || num_dims == 3) {
    IndexType num_batches;
    IndexType rows;
    IndexType cols;
    InferBatchTransposeShape<num_dims, IndexType>(src_dims, &num_batches, &rows, &cols);
    if (CheckLaunchBatchTranspose<num_dims, kMov4TileSize>(params.permutation, num_batches, rows,
                                                           cols)) {
      // Prefer the movement_size=2 variant when alignment and parity allow it.
      if (CheckUseMov2<IndexType, movement_size>(rows, cols, src, dst)) {
        LaunchBatchTransposeKernel<num_dims, 2, kMov2TileSize, IndexType>(cuda_stream, params,
                                                                          num_batches, rows, cols);
      } else {
        LaunchBatchTransposeKernel<num_dims, movement_size, kMov4TileSize, IndexType>(
            cuda_stream, params, num_batches, rows, cols);
      }
    } else {
      PermuteKernel<num_dims, movement_size, IndexType>
          <<<BlocksNum4ThreadsNum(params.count), kCudaThreadsNumPerBlock, 0, cuda_stream>>>(params);
    }
  } else {
    PermuteKernel<num_dims, movement_size, IndexType>
        <<<BlocksNum4ThreadsNum(params.count), kCudaThreadsNumPerBlock, 0, cuda_stream>>>(params);
  }
}
// Permute primitive backed by the kernels above; SimplifyThenLaunch (defined
// elsewhere in this translation unit's shared code) collapses the permutation
// before dispatching to LaunchKernel.
class PermuteImpl : public Permute {
 public:
  OF_DISALLOW_COPY_AND_MOVE(PermuteImpl);
  PermuteImpl() = default;
  ~PermuteImpl() override = default;
  using Permute::Launch;
  void Launch(Stream* stream, DataType data_type, size_t num_dims, const int64_t* src_dims,
              const void* src, const int* permutation, void* dst) override {
    SimplifyThenLaunch(stream, data_type, num_dims, src_dims, src, permutation, dst);
  }
};
class PermuteFactoryImpl : public PermuteFactory {
public:
OF_DISALLOW_COPY_AND_MOVE(PermuteFactoryImpl);
PermuteFactoryImpl() = default;
~PermuteFactoryImpl() override = default;
std::unique_ptr<Permute> New(size_t max_num_dims) override {
if (max_num_dims <= kMaxNumDims) {
return std::unique_ptr<Permute>(new PermuteImpl());
} else {
return nullptr;
}
}
};
// NOTE(review): registered under DeviceType::kCUDA although this is the
// ROCm/HIP build — the port appears to reuse the CUDA device tag; confirm.
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, PermuteFactory, PermuteFactoryImpl);
}  // namespace
}  // namespace internal
}  // namespace permute
}  // namespace primitive
}  // namespace ep
}  // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/common/preprocessor.h"
#include "oneflow/core/ep/include/primitive/softmax.h"
#include "oneflow/core/ep/include/primitive/log_softmax.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/hip/softmax.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Selects which forward transform the shared dispatch code below performs.
enum class Algorithm {
  kSoftmax,
  kLogSoftmax,
};
// Runs (log-)softmax over a rows-by-cols buffer on the given HIP stream,
// accumulating in the compute type selected by DefaultComputeType<T>.
template<Algorithm algorithm, typename T>
void SoftmaxGpu(hipStream_t cuda_stream, size_t rows, size_t cols, const T* x, T* y) {
  using ComputeType = typename cuda::softmax::DefaultComputeType<T>::type;
  oneflow::cuda::softmax::DirectLoad<T, ComputeType> load(x, cols);
  oneflow::cuda::softmax::DirectStore<ComputeType, T> store(y, cols);
  switch (algorithm) {
    case Algorithm::kSoftmax:
      OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax<decltype(load), decltype(store), ComputeType>(
          cuda_stream, load, store, rows, cols)));
      break;
    case Algorithm::kLogSoftmax:
      OF_CUDA_CHECK(
          (cuda::softmax::DispatchLogSoftmax<decltype(load), decltype(store), ComputeType>(
              cuda_stream, load, store, rows, cols)));
      break;
    default: UNIMPLEMENTED();
  }
}
// Thin adapter from the type-erased primitive interface to SoftmaxGpu: casts
// the untyped buffers to T and forwards along with the stream's HIP queue.
template<typename SoftmaxBase, Algorithm algorithm, typename T>
class SoftmaxImpl : public SoftmaxBase {
 public:
  OF_DISALLOW_COPY_AND_MOVE(SoftmaxImpl);
  SoftmaxImpl() = default;
  ~SoftmaxImpl() override = default;
  void Launch(Stream* stream, size_t rows, size_t cols, const void* x, void* y) override {
    hipStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
    SoftmaxGpu<algorithm, T>(cuda_stream, rows, cols, reinterpret_cast<const T*>(x),
                             reinterpret_cast<T*>(y));
  }
};
template<typename SoftmaxBase, Algorithm algorithm, typename T>
std::unique_ptr<SoftmaxBase> NewSoftmax() {
return std::unique_ptr<SoftmaxBase>(new SoftmaxImpl<SoftmaxBase, algorithm, T>());
}
// Factory template shared by Softmax and LogSoftmax: maps a runtime DataType to
// a constructor for the matching SoftmaxImpl instantiation.
template<typename FactoryBase, typename SoftmaxBase, Algorithm algorithm>
class GenericSoftmaxFactoryImpl : public FactoryBase {
 public:
  OF_DISALLOW_COPY_AND_MOVE(GenericSoftmaxFactoryImpl);
  GenericSoftmaxFactoryImpl() = default;
  ~GenericSoftmaxFactoryImpl() override = default;
  // Returns a new primitive for data_type, or nullptr when the type is not in
  // CUDA_PRIMITIVE_FLOATING_TYPE_SEQ.
  std::unique_ptr<SoftmaxBase> New(DataType data_type) override {
#define MAKE_NEW_SOFTMAX_ENTRY(type_cpp, type_proto) \
  {type_proto, NewSoftmax<SoftmaxBase, algorithm, type_cpp>},
    // The lookup table is built once (function-local static) and reused.
    static const std::map<DataType, std::function<std::unique_ptr<SoftmaxBase>()>>
        new_softmax_handle{
            OF_PP_FOR_EACH_TUPLE(MAKE_NEW_SOFTMAX_ENTRY, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)};
#undef MAKE_NEW_SOFTMAX_ENTRY
    const auto it = new_softmax_handle.find(data_type);
    if (it != new_softmax_handle.end()) {
      return it->second();
    } else {
      return nullptr;
    }
  }
};
// Bind the generic factory to the forward softmax / log-softmax interfaces.
using SoftmaxFactoryImpl = GenericSoftmaxFactoryImpl<SoftmaxFactory, Softmax, Algorithm::kSoftmax>;
using LogSoftmaxFactoryImpl =
    GenericSoftmaxFactoryImpl<LogSoftmaxFactory, LogSoftmax, Algorithm::kLogSoftmax>;
// NOTE(review): registered under kCUDA even though this is the ROCm/HIP port.
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, SoftmaxFactory, SoftmaxFactoryImpl);
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, LogSoftmaxFactory, LogSoftmaxFactoryImpl);
}  // namespace
}  // namespace primitive
}  // namespace ep
}  // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/common/preprocessor.h"
#include "oneflow/core/ep/include/primitive/softmax.h"
#include "oneflow/core/ep/include/primitive/log_softmax.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/hip/softmax.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Selects which forward transform the shared dispatch code below performs.
enum class Algorithm {
  kSoftmax,
  kLogSoftmax,
};
// Runs (log-)softmax over a rows-by-cols buffer on the given HIP stream,
// accumulating in the compute type selected by DefaultComputeType<T>.
template<Algorithm algorithm, typename T>
void SoftmaxGpu(hipStream_t cuda_stream, size_t rows, size_t cols, const T* x, T* y) {
  using ComputeType = typename cuda::softmax::DefaultComputeType<T>::type;
  oneflow::cuda::softmax::DirectLoad<T, ComputeType> load(x, cols);
  oneflow::cuda::softmax::DirectStore<ComputeType, T> store(y, cols);
  switch (algorithm) {
    case Algorithm::kSoftmax:
      OF_CUDA_CHECK((cuda::softmax::DispatchSoftmax<decltype(load), decltype(store), ComputeType>(
          cuda_stream, load, store, rows, cols)));
      break;
    case Algorithm::kLogSoftmax:
      OF_CUDA_CHECK(
          (cuda::softmax::DispatchLogSoftmax<decltype(load), decltype(store), ComputeType>(
              cuda_stream, load, store, rows, cols)));
      break;
    default: UNIMPLEMENTED();
  }
}
// Thin adapter from the type-erased primitive interface to SoftmaxGpu: casts
// the untyped buffers to T and forwards along with the stream's HIP queue.
template<typename SoftmaxBase, Algorithm algorithm, typename T>
class SoftmaxImpl : public SoftmaxBase {
 public:
  OF_DISALLOW_COPY_AND_MOVE(SoftmaxImpl);
  SoftmaxImpl() = default;
  ~SoftmaxImpl() override = default;
  void Launch(Stream* stream, size_t rows, size_t cols, const void* x, void* y) override {
    hipStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
    SoftmaxGpu<algorithm, T>(cuda_stream, rows, cols, reinterpret_cast<const T*>(x),
                             reinterpret_cast<T*>(y));
  }
};
template<typename SoftmaxBase, Algorithm algorithm, typename T>
std::unique_ptr<SoftmaxBase> NewSoftmax() {
return std::unique_ptr<SoftmaxBase>(new SoftmaxImpl<SoftmaxBase, algorithm, T>());
}
// Factory template shared by Softmax and LogSoftmax: maps a runtime DataType to
// a constructor for the matching SoftmaxImpl instantiation.
template<typename FactoryBase, typename SoftmaxBase, Algorithm algorithm>
class GenericSoftmaxFactoryImpl : public FactoryBase {
 public:
  OF_DISALLOW_COPY_AND_MOVE(GenericSoftmaxFactoryImpl);
  GenericSoftmaxFactoryImpl() = default;
  ~GenericSoftmaxFactoryImpl() override = default;
  // Returns a new primitive for data_type, or nullptr when the type is not in
  // CUDA_PRIMITIVE_FLOATING_TYPE_SEQ.
  std::unique_ptr<SoftmaxBase> New(DataType data_type) override {
#define MAKE_NEW_SOFTMAX_ENTRY(type_cpp, type_proto) \
  {type_proto, NewSoftmax<SoftmaxBase, algorithm, type_cpp>},
    // The lookup table is built once (function-local static) and reused.
    static const std::map<DataType, std::function<std::unique_ptr<SoftmaxBase>()>>
        new_softmax_handle{
            OF_PP_FOR_EACH_TUPLE(MAKE_NEW_SOFTMAX_ENTRY, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)};
#undef MAKE_NEW_SOFTMAX_ENTRY
    const auto it = new_softmax_handle.find(data_type);
    if (it != new_softmax_handle.end()) {
      return it->second();
    } else {
      return nullptr;
    }
  }
};
// Bind the generic factory to the forward softmax / log-softmax interfaces.
using SoftmaxFactoryImpl = GenericSoftmaxFactoryImpl<SoftmaxFactory, Softmax, Algorithm::kSoftmax>;
using LogSoftmaxFactoryImpl =
    GenericSoftmaxFactoryImpl<LogSoftmaxFactory, LogSoftmax, Algorithm::kLogSoftmax>;
// NOTE(review): registered under kCUDA even though this is the ROCm/HIP port.
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, SoftmaxFactory, SoftmaxFactoryImpl);
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, LogSoftmaxFactory, LogSoftmaxFactoryImpl);
}  // namespace
}  // namespace primitive
}  // namespace ep
}  // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/common/preprocessor.h"
#include "oneflow/core/ep/include/primitive/softmax_backward.h"
#include "oneflow/core/ep/include/primitive/log_softmax_backward.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/hip/softmax.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Selects which gradient transform the shared dispatch code below performs.
enum class Algorithm {
  kSoftmax,
  kLogSoftmax,
};
// Computes the (log-)softmax gradient dx from the forward output y and the
// incoming gradient dy, accumulating in DefaultComputeType<T>.
template<Algorithm algorithm, typename T>
void SoftmaxBackwardGpu(hipStream_t cuda_stream, size_t rows, size_t cols, const T* y, const T* dy,
                        T* dx) {
  using ComputeType = typename cuda::softmax::DefaultComputeType<T>::type;
  cuda::softmax::DirectLoad<T, ComputeType> load_y(y, cols);
  cuda::softmax::DirectLoad<T, ComputeType> load_dy(dy, cols);
  cuda::softmax::DirectStore<ComputeType, T> store(dx, cols);
  switch (algorithm) {
    case Algorithm::kSoftmax:
      OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad<decltype(load_y), decltype(load_dy),
                                                        decltype(store), ComputeType>(
          cuda_stream, load_y, load_dy, store, rows, cols)));
      break;
    case Algorithm::kLogSoftmax:
      OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmaxGrad<decltype(load_y), decltype(load_dy),
                                                           decltype(store), ComputeType>(
          cuda_stream, load_y, load_dy, store, rows, cols)));
      break;
    default: UNIMPLEMENTED();
  }
}
// Thin adapter from the type-erased backward interface to SoftmaxBackwardGpu:
// casts the untyped buffers to T and forwards with the stream's HIP queue.
template<typename SoftmaxBackwardBase, Algorithm algorithm, typename T>
class SoftmaxBackwardImpl : public SoftmaxBackwardBase {
 public:
  OF_DISALLOW_COPY_AND_MOVE(SoftmaxBackwardImpl);
  SoftmaxBackwardImpl() = default;
  ~SoftmaxBackwardImpl() override = default;
  // y: forward output, dy: upstream gradient, dx: computed input gradient
  // (names follow the usual autograd convention — semantics live in the kernel).
  void Launch(Stream* stream, size_t rows, size_t cols, const void* y, const void* dy,
              void* dx) override {
    hipStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
    SoftmaxBackwardGpu<algorithm, T>(cuda_stream, rows, cols, reinterpret_cast<const T*>(y),
                                     reinterpret_cast<const T*>(dy), reinterpret_cast<T*>(dx));
  }
};
template<typename SoftmaxBackwardBase, Algorithm algorithm, typename T>
std::unique_ptr<SoftmaxBackwardBase> NewSoftmaxBackward() {
return std::unique_ptr<SoftmaxBackwardBase>(
new SoftmaxBackwardImpl<SoftmaxBackwardBase, algorithm, T>());
}
// Factory template shared by SoftmaxBackward and LogSoftmaxBackward: maps a
// runtime DataType to a constructor for the matching backward instantiation.
template<typename BackwardFactoryBase, typename SoftmaxBackwardBase, Algorithm algorithm>
class GenericSoftmaxBackwardFactoryImpl : public BackwardFactoryBase {
 public:
  OF_DISALLOW_COPY_AND_MOVE(GenericSoftmaxBackwardFactoryImpl);
  GenericSoftmaxBackwardFactoryImpl() = default;
  ~GenericSoftmaxBackwardFactoryImpl() override = default;
  // Returns a new primitive for data_type, or nullptr when the type is not in
  // CUDA_PRIMITIVE_FLOATING_TYPE_SEQ.
  std::unique_ptr<SoftmaxBackwardBase> New(DataType data_type) override {
#define MAKE_NEW_SOFTMAX_ENTRY(type_cpp, type_proto) \
  {type_proto, NewSoftmaxBackward<SoftmaxBackwardBase, algorithm, type_cpp>},
    // The lookup table is built once (function-local static) and reused.
    static const std::map<DataType, std::function<std::unique_ptr<SoftmaxBackwardBase>()>>
        new_softmax_backward_handle{
            OF_PP_FOR_EACH_TUPLE(MAKE_NEW_SOFTMAX_ENTRY, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)};
#undef MAKE_NEW_SOFTMAX_ENTRY
    const auto it = new_softmax_backward_handle.find(data_type);
    if (it != new_softmax_backward_handle.end()) {
      return it->second();
    } else {
      return nullptr;
    }
  }
};
// Bind the generic factory to the backward softmax / log-softmax interfaces.
using SoftmaxBackwardFactoryImpl =
    GenericSoftmaxBackwardFactoryImpl<SoftmaxBackwardFactory, SoftmaxBackward, Algorithm::kSoftmax>;
using LogSoftmaxBackwardFactoryImpl =
    GenericSoftmaxBackwardFactoryImpl<LogSoftmaxBackwardFactory, LogSoftmaxBackward,
                                      Algorithm::kLogSoftmax>;
// NOTE(review): registered under kCUDA even though this is the ROCm/HIP port.
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, SoftmaxBackwardFactory, SoftmaxBackwardFactoryImpl);
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, LogSoftmaxBackwardFactory,
                           LogSoftmaxBackwardFactoryImpl);
}  // namespace
}  // namespace primitive
}  // namespace ep
}  // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/common/preprocessor.h"
#include "oneflow/core/ep/include/primitive/softmax_backward.h"
#include "oneflow/core/ep/include/primitive/log_softmax_backward.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/hip/softmax.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
// Selects which gradient transform the shared dispatch code below performs.
enum class Algorithm {
  kSoftmax,
  kLogSoftmax,
};
// Computes the (log-)softmax gradient dx from the forward output y and the
// incoming gradient dy, accumulating in DefaultComputeType<T>.
template<Algorithm algorithm, typename T>
void SoftmaxBackwardGpu(hipStream_t cuda_stream, size_t rows, size_t cols, const T* y, const T* dy,
                        T* dx) {
  using ComputeType = typename cuda::softmax::DefaultComputeType<T>::type;
  cuda::softmax::DirectLoad<T, ComputeType> load_y(y, cols);
  cuda::softmax::DirectLoad<T, ComputeType> load_dy(dy, cols);
  cuda::softmax::DirectStore<ComputeType, T> store(dx, cols);
  switch (algorithm) {
    case Algorithm::kSoftmax:
      OF_CUDA_CHECK((cuda::softmax::DispatchSoftmaxGrad<decltype(load_y), decltype(load_dy),
                                                        decltype(store), ComputeType>(
          cuda_stream, load_y, load_dy, store, rows, cols)));
      break;
    case Algorithm::kLogSoftmax:
      OF_CUDA_CHECK((cuda::softmax::DispatchLogSoftmaxGrad<decltype(load_y), decltype(load_dy),
                                                           decltype(store), ComputeType>(
          cuda_stream, load_y, load_dy, store, rows, cols)));
      break;
    default: UNIMPLEMENTED();
  }
}
// Thin adapter from the type-erased backward interface to SoftmaxBackwardGpu:
// casts the untyped buffers to T and forwards with the stream's HIP queue.
template<typename SoftmaxBackwardBase, Algorithm algorithm, typename T>
class SoftmaxBackwardImpl : public SoftmaxBackwardBase {
 public:
  OF_DISALLOW_COPY_AND_MOVE(SoftmaxBackwardImpl);
  SoftmaxBackwardImpl() = default;
  ~SoftmaxBackwardImpl() override = default;
  // y: forward output, dy: upstream gradient, dx: computed input gradient
  // (names follow the usual autograd convention — semantics live in the kernel).
  void Launch(Stream* stream, size_t rows, size_t cols, const void* y, const void* dy,
              void* dx) override {
    hipStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
    SoftmaxBackwardGpu<algorithm, T>(cuda_stream, rows, cols, reinterpret_cast<const T*>(y),
                                     reinterpret_cast<const T*>(dy), reinterpret_cast<T*>(dx));
  }
};
template<typename SoftmaxBackwardBase, Algorithm algorithm, typename T>
std::unique_ptr<SoftmaxBackwardBase> NewSoftmaxBackward() {
return std::unique_ptr<SoftmaxBackwardBase>(
new SoftmaxBackwardImpl<SoftmaxBackwardBase, algorithm, T>());
}
// Factory template shared by SoftmaxBackward and LogSoftmaxBackward: maps a
// runtime DataType to a constructor for the matching backward instantiation.
template<typename BackwardFactoryBase, typename SoftmaxBackwardBase, Algorithm algorithm>
class GenericSoftmaxBackwardFactoryImpl : public BackwardFactoryBase {
 public:
  OF_DISALLOW_COPY_AND_MOVE(GenericSoftmaxBackwardFactoryImpl);
  GenericSoftmaxBackwardFactoryImpl() = default;
  ~GenericSoftmaxBackwardFactoryImpl() override = default;
  // Returns a new primitive for data_type, or nullptr when the type is not in
  // CUDA_PRIMITIVE_FLOATING_TYPE_SEQ.
  std::unique_ptr<SoftmaxBackwardBase> New(DataType data_type) override {
#define MAKE_NEW_SOFTMAX_ENTRY(type_cpp, type_proto) \
  {type_proto, NewSoftmaxBackward<SoftmaxBackwardBase, algorithm, type_cpp>},
    // The lookup table is built once (function-local static) and reused.
    static const std::map<DataType, std::function<std::unique_ptr<SoftmaxBackwardBase>()>>
        new_softmax_backward_handle{
            OF_PP_FOR_EACH_TUPLE(MAKE_NEW_SOFTMAX_ENTRY, CUDA_PRIMITIVE_FLOATING_TYPE_SEQ)};
#undef MAKE_NEW_SOFTMAX_ENTRY
    const auto it = new_softmax_backward_handle.find(data_type);
    if (it != new_softmax_backward_handle.end()) {
      return it->second();
    } else {
      return nullptr;
    }
  }
};
// Bind the generic factory to the backward softmax / log-softmax interfaces.
using SoftmaxBackwardFactoryImpl =
    GenericSoftmaxBackwardFactoryImpl<SoftmaxBackwardFactory, SoftmaxBackward, Algorithm::kSoftmax>;
using LogSoftmaxBackwardFactoryImpl =
    GenericSoftmaxBackwardFactoryImpl<LogSoftmaxBackwardFactory, LogSoftmaxBackward,
                                      Algorithm::kLogSoftmax>;
// NOTE(review): registered under kCUDA even though this is the ROCm/HIP port.
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, SoftmaxBackwardFactory, SoftmaxBackwardFactoryImpl);
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, LogSoftmaxBackwardFactory,
                           LogSoftmaxBackwardFactoryImpl);
}  // namespace
}  // namespace primitive
}  // namespace ep
}  // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_
#define ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_
#include "oneflow/core/common/preprocessor.h"
#include "oneflow/core/common/data_type.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
// bfloat16 support from the CUDA version is disabled in this ROCm port.
// #if CUDA_VERSION >= 11000
// #include <cuda_bf16.h>
// #endif  // CUDA_VERSION >= 11000
// Each entry pairs a C++ type with its DataType enum tag for use with
// OF_PP_FOR_EACH_TUPLE-style dispatch.
#define CUDA_PRIMITIVE_BOOL_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(bool, DataType::kBool)
#define CUDA_PRIMITIVE_CHAR_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(char, DataType::kChar)
#define CUDA_PRIMITIVE_INT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8)
#define CUDA_PRIMITIVE_UINT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8)
#define CUDA_PRIMITIVE_INT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32)
#define CUDA_PRIMITIVE_UINT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32)
#define CUDA_PRIMITIVE_INT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64)
#define CUDA_PRIMITIVE_UINT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64)
#define CUDA_PRIMITIVE_FLOAT_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat)
#define CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(double, DataType::kDouble)
#define CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(half, DataType::kFloat16)
// #if CUDA_VERSION >= 11000
// #define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(nv_bfloat16, DataType::kBFloat16)
// #else
// Kept empty so seqs below can reference it unconditionally.
#define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
// #endif  // CUDA_VERSION >= 11000
#define CUDA_PRIMITIVE_ALL_TYPE_SEQ \
  CUDA_PRIMITIVE_BOOL_TYPE_SEQ      \
  CUDA_PRIMITIVE_CHAR_TYPE_SEQ      \
  CUDA_PRIMITIVE_INT8_TYPE_SEQ      \
  CUDA_PRIMITIVE_UINT8_TYPE_SEQ     \
  CUDA_PRIMITIVE_INT32_TYPE_SEQ     \
  CUDA_PRIMITIVE_INT64_TYPE_SEQ     \
  CUDA_PRIMITIVE_FLOAT_TYPE_SEQ     \
  CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ    \
  CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ   \
  CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#define CUDA_PRIMITIVE_FLOATING_TYPE_SEQ \
  CUDA_PRIMITIVE_FLOAT_TYPE_SEQ          \
  CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ         \
  CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ        \
  CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#define UTIL_OPS_DATA_TYPE_SEQ      \
  CUDA_PRIMITIVE_INT8_TYPE_SEQ      \
  CUDA_PRIMITIVE_UINT8_TYPE_SEQ     \
  CUDA_PRIMITIVE_INT32_TYPE_SEQ     \
  CUDA_PRIMITIVE_INT64_TYPE_SEQ     \
  CUDA_PRIMITIVE_FLOAT_TYPE_SEQ     \
  CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ    \
  CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ   \
  CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#endif  // WITH_ROCM
// Fix: the include guard opened above was never closed — the original ended at
// the WITH_ROCM #endif, leaving an unterminated #ifndef (compare the sibling
// copy of this header, which has both #endifs).
#endif  // ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Type sequences for the ROCm/HIP ("kCUDA"-tagged) primitives: each entry
// pairs a C++ type with its DataType enum tag via OF_PP_MAKE_TUPLE_SEQ.
#ifndef ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_
#define ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_
#include "oneflow/core/common/preprocessor.h"
#include "oneflow/core/common/data_type.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
// bfloat16 support from the CUDA version is disabled in this ROCm port.
// #if CUDA_VERSION >= 11000
// #include <cuda_bf16.h>
// #endif  // CUDA_VERSION >= 11000
#define CUDA_PRIMITIVE_BOOL_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(bool, DataType::kBool)
#define CUDA_PRIMITIVE_CHAR_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(char, DataType::kChar)
#define CUDA_PRIMITIVE_INT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8)
#define CUDA_PRIMITIVE_UINT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8)
#define CUDA_PRIMITIVE_INT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32)
#define CUDA_PRIMITIVE_UINT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32)
#define CUDA_PRIMITIVE_INT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64)
#define CUDA_PRIMITIVE_UINT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64)
#define CUDA_PRIMITIVE_FLOAT_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat)
#define CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(double, DataType::kDouble)
#define CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(half, DataType::kFloat16)
// #if CUDA_VERSION >= 11000
// #define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(nv_bfloat16, DataType::kBFloat16)
// #else
// Kept empty so seqs below can reference it unconditionally.
#define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
// #endif  // CUDA_VERSION >= 11000
#define CUDA_PRIMITIVE_ALL_TYPE_SEQ \
  CUDA_PRIMITIVE_BOOL_TYPE_SEQ      \
  CUDA_PRIMITIVE_CHAR_TYPE_SEQ      \
  CUDA_PRIMITIVE_INT8_TYPE_SEQ      \
  CUDA_PRIMITIVE_UINT8_TYPE_SEQ     \
  CUDA_PRIMITIVE_INT32_TYPE_SEQ     \
  CUDA_PRIMITIVE_INT64_TYPE_SEQ     \
  CUDA_PRIMITIVE_FLOAT_TYPE_SEQ     \
  CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ    \
  CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ   \
  CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#define CUDA_PRIMITIVE_FLOATING_TYPE_SEQ \
  CUDA_PRIMITIVE_FLOAT_TYPE_SEQ          \
  CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ         \
  CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ        \
  CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#define UTIL_OPS_DATA_TYPE_SEQ      \
  CUDA_PRIMITIVE_INT8_TYPE_SEQ      \
  CUDA_PRIMITIVE_UINT8_TYPE_SEQ     \
  CUDA_PRIMITIVE_INT32_TYPE_SEQ     \
  CUDA_PRIMITIVE_INT64_TYPE_SEQ     \
  CUDA_PRIMITIVE_FLOAT_TYPE_SEQ     \
  CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ    \
  CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ   \
  CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#endif  // WITH_ROCM
#endif  // ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/common/primitive/unary_functor.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace ep {
namespace primitive {
// Gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))), evaluated in the source type and
// implicitly converted to Dst on return. attr0/attr1 are unused for this op.
template<typename Dst, typename Src>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kGelu, Dst, Src> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC Dst operator()(Src src) const {
    // M_SQRT1_2 (a double constant) is narrowed to Src before the multiply.
    return static_cast<Src>(0.5) * src
           * (static_cast<Src>(1.0) + erf(static_cast<Src>(M_SQRT1_2) * src));
  }
};
// tanh: float/double call the math-library overloads directly; half round-trips
// through float because no native half tanh is used here.
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTanh, float, float> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC float operator()(float src) const { return tanhf(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTanh, double, double> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC double operator()(double src) const { return tanh(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTanh, half, half> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC half operator()(half src) const { return __float2half(tanhf(__half2float(src))); }
};
// Classification predicates (isinf/isnan) returning bool; half inputs are
// widened to float since the predicates are applied to float/double only.
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, half> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC bool operator()(half src) const { return isinf(__half2float(src)); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, float> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC bool operator()(float src) const { return isinf(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, double> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC bool operator()(double src) const { return isinf(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, half> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC bool operator()(half src) const { return isnan(__half2float(src)); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, float> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC bool operator()(float src) const { return isnan(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, double> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC bool operator()(double src) const { return isnan(src); }
};
// Pseudo-half fallback: wrap the float functor and convert half <-> float at
// the boundary. attr0/attr1 are forwarded so parameterized ops keep their
// attributes.
#define SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(op)                          \
  template<>                                                                  \
  struct UnaryFunctor<DeviceType::kCUDA, op, half, half> {                    \
    UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
                                                                              \
    UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor;          \
    OF_DEVICE_FUNC half operator()(half src) const {                          \
      return __float2half(float_functor(__half2float(src)));                  \
    }                                                                         \
  };
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kElu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kCelu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kGelu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kMish);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSelu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSilu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftSign);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftPlus);
// /*********nv_bfloat16_kernel*******/
// #if CUDA_VERSION >= 11000
// #define SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(op) \
// template<> \
// struct UnaryFunctor<DeviceType::kCUDA, op, nv_bfloat16, nv_bfloat16> { \
// UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
// \
// UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src) const { \
// return __float2bfloat16(float_functor(__bfloat162float(src))); \
// } \
// };
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kElu);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kCelu);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kGelu);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardSwish);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardSigmoid);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardShrink);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardTanh);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLeakyRelu);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kMish);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSelu);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSilu);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftShrink);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftSign);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftPlus);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kTanh);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kThreshold);
// template<>
// struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, nv_bfloat16> {
// UnaryFunctor(Scalar attr0, Scalar attr1) {}
// OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isinf(__bfloat162float(src)); }
// };
// template<>
// struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, nv_bfloat16> {
// UnaryFunctor(Scalar attr0, Scalar attr1) {}
// OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isnan(__bfloat162float(src)); }
// };
// #endif
} // namespace primitive
} // namespace ep
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/common/primitive/unary_functor.h"
#include "oneflow/core/ep/rocm/primitive/type_seq.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace ep {
namespace primitive {
// Gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))), evaluated in the source type and
// implicitly converted to Dst on return. attr0/attr1 are unused for this op.
template<typename Dst, typename Src>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kGelu, Dst, Src> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC Dst operator()(Src src) const {
    // M_SQRT1_2 (a double constant) is narrowed to Src before the multiply.
    return static_cast<Src>(0.5) * src
           * (static_cast<Src>(1.0) + erf(static_cast<Src>(M_SQRT1_2) * src));
  }
};
// tanh: float/double call the math-library overloads directly; half round-trips
// through float because no native half tanh is used here.
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTanh, float, float> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC float operator()(float src) const { return tanhf(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTanh, double, double> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC double operator()(double src) const { return tanh(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTanh, half, half> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  OF_DEVICE_FUNC half operator()(half src) const { return __float2half(tanhf(__half2float(src))); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, half> {
UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(half src) const { return isinf(__half2float(src)); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, float> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  // True iff src is +inf or -inf.
  OF_DEVICE_FUNC bool operator()(float src) const {
    const bool result = isinf(src);
    return result;
  }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, double> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  // True iff src is +inf or -inf.
  OF_DEVICE_FUNC bool operator()(double src) const {
    const bool result = isinf(src);
    return result;
  }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, half> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  // NaN test for half input: widen to float first (NaN payloads survive the
  // half-to-float conversion, so the classification is preserved).
  OF_DEVICE_FUNC bool operator()(half src) const {
    const float src_f = __half2float(src);
    return isnan(src_f);
  }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, float> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  // True iff src is NaN.
  OF_DEVICE_FUNC bool operator()(float src) const {
    const bool result = isnan(src);
    return result;
  }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, double> {
  UnaryFunctor(Scalar attr0, Scalar attr1) {}
  // True iff src is NaN.
  OF_DEVICE_FUNC bool operator()(double src) const {
    const bool result = isnan(src);
    return result;
  }
};
// Fallback specialization for ops with no native half implementation:
// convert the half input to float, delegate to the existing float functor
// (which also receives the attrs), and narrow the result back to half.
// NOTE(review): costs two conversions per element — presumably acceptable
// until dedicated half kernels exist; confirm against perf requirements.
#define SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(op)                          \
  template<>                                                                  \
  struct UnaryFunctor<DeviceType::kCUDA, op, half, half> {                    \
    UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
                                                                              \
    UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor;          \
    OF_DEVICE_FUNC half operator()(half src) const {                          \
      return __float2half(float_functor(__half2float(src)));                  \
    }                                                                         \
  };
// Ops that currently route half inputs through the float fallback above.
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kElu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kCelu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kGelu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kMish);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSelu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSilu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftSign);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftPlus);
// /*********nv_bfloat16_kernel*******/
// #if CUDA_VERSION >= 11000
// #define SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(op) \
// template<> \
// struct UnaryFunctor<DeviceType::kCUDA, op, nv_bfloat16, nv_bfloat16> { \
// UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
// \
// UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
// OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src) const { \
// return __float2bfloat16(float_functor(__bfloat162float(src))); \
// } \
// };
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kElu);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kCelu);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kGelu);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardSwish);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardSigmoid);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardShrink);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kHardTanh);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLeakyRelu);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kMish);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSelu);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSilu);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftShrink);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftSign);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftPlus);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kTanh);
// SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kThreshold);
// template<>
// struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, nv_bfloat16> {
// UnaryFunctor(Scalar attr0, Scalar attr1) {}
// OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isinf(__bfloat162float(src)); }
// };
// template<>
// struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, nv_bfloat16> {
// UnaryFunctor(Scalar attr0, Scalar attr1) {}
// OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isnan(__bfloat162float(src)); }
// };
// #endif
} // namespace primitive
} // namespace ep
} // namespace oneflow
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment