Commit a715222c authored by yuguo

0.9.1-rocm

parent f262efc9
......@@ -60,3 +60,51 @@ REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemcpyFactory, MemcpyFactoryImpl);
} // namespace oneflow
#endif
#ifdef WITH_ROCM
#include "oneflow/core/ep/include/primitive/memcpy.h"
#include "oneflow/core/ep/cuda/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
class MemcpyImpl : public Memcpy {
public:
OF_DISALLOW_COPY_AND_MOVE(MemcpyImpl);
MemcpyImpl() = default;
~MemcpyImpl() override = default;
void Launch(Stream* stream, void* dst, const void* src, size_t count) override {
if (dst == src) { return; }
auto* cuda_stream = stream->As<CudaStream>();
OF_CUDA_CHECK(hipMemcpyAsync(dst, src, count, hipMemcpyDefault, cuda_stream->cuda_stream()));
}
};
class MemcpyFactoryImpl : public MemcpyFactory {
public:
OF_DISALLOW_COPY_AND_MOVE(MemcpyFactoryImpl);
MemcpyFactoryImpl() = default;
~MemcpyFactoryImpl() override = default;
std::unique_ptr<Memcpy> New(MemcpyKind kind) override {
return std::unique_ptr<Memcpy>(new MemcpyImpl());
}
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemcpyFactory, MemcpyFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif
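For context, callers do not instantiate MemcpyImpl directly; they resolve it through the factory registry. A minimal sketch, assuming OneFlow's NewPrimitive helper from primitive.h and pre-existing stream, dst, src, and count (none of which are part of this diff):

// Hypothetical call site: the registry returns the kCUDA-registered factory,
// which in this ROCm build wraps hipMemcpyAsync.
auto copy = oneflow::ep::primitive::NewPrimitive<oneflow::ep::primitive::MemcpyFactory>(
    oneflow::DeviceType::kCUDA, oneflow::ep::primitive::MemcpyKind::kDtoD);
if (copy) { copy->Launch(stream, dst, src, count); }  // asynchronous on `stream`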
......@@ -57,3 +57,49 @@ REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemsetFactory, MemsetFactoryImpl);
} // namespace oneflow
#endif
#ifdef WITH_ROCM
#include "oneflow/core/ep/include/primitive/memset.h"
#include "oneflow/core/ep/cuda/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
class MemsetImpl : public Memset {
public:
OF_DISALLOW_COPY_AND_MOVE(MemsetImpl);
MemsetImpl() = default;
~MemsetImpl() override = default;
void Launch(Stream* stream, void* ptr, int value, size_t count) override {
auto* cuda_stream = stream->As<CudaStream>();
OF_CUDA_CHECK(hipMemsetAsync(ptr, value, count, cuda_stream->cuda_stream()));
}
};
class MemsetFactoryImpl : public MemsetFactory {
public:
OF_DISALLOW_COPY_AND_MOVE(MemsetFactoryImpl);
MemsetFactoryImpl() = default;
~MemsetFactoryImpl() override = default;
std::unique_ptr<Memset> New() override { return std::unique_ptr<Memset>(new MemsetImpl()); }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemsetFactory, MemsetFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif
......@@ -16,7 +16,11 @@ limitations under the License.
#include "oneflow/core/ep/include/primitive/permute.h"
#include "oneflow/core/ep/common/primitive/permute_impl.h"
#include "oneflow/core/ep/cuda/cuda_stream.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif
namespace oneflow {
......@@ -192,7 +196,7 @@ __global__ void BatchTransposeMovement2Kernel(const void* src_ptr, void* dst_ptr
}
template<size_t num_dims, size_t movement_size, size_t tile_size, typename IndexType>
-void LaunchBatchTransposeKernel(cudaStream_t& cuda_stream,
+void LaunchBatchTransposeKernel(GPU(Stream_t)& cuda_stream,
const PermuteKernelParams<num_dims, IndexType>& params,
const IndexType& num_batches, const IndexType& rows,
const IndexType& cols) {
......@@ -264,7 +268,7 @@ void LaunchKernel(Stream* stream, const int64_t* src_dims, const void* src, cons
void* dst, size_t count) {
PermuteKernelParams<num_dims, IndexType> params =
MakePermuteParams<num_dims, IndexType>(src_dims, src, permutation, dst, count);
-cudaStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
+GPU(Stream_t) cuda_stream = stream->As<CudaStream>()->cuda_stream();
if (num_dims == 2 || num_dims == 3) {
IndexType num_batches;
......@@ -281,10 +285,12 @@ void LaunchKernel(Stream* stream, const int64_t* src_dims, const void* src, cons
cuda_stream, params, num_batches, rows, cols);
}
} else {
if (params.count == 0) { return; }
PermuteKernel<num_dims, movement_size, IndexType>
<<<BlocksNum4ThreadsNum(params.count), kCudaThreadsNumPerBlock, 0, cuda_stream>>>(params);
}
} else {
if (params.count == 0) { return; }
PermuteKernel<num_dims, movement_size, IndexType>
<<<BlocksNum4ThreadsNum(params.count), kCudaThreadsNumPerBlock, 0, cuda_stream>>>(params);
}
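The two `if (params.count == 0) { return; }` guards added above are not cosmetic: BlocksNum4ThreadsNum(0) yields a zero-sized grid, and launching with zero blocks fails with an invalid-configuration error on both CUDA and HIP. A standalone illustration of the same guard pattern (hypothetical kernel, not from this commit):

__global__ void Noop(float* p, size_t n) { /* no-op; placeholder body */ }

void SafeLaunch(GPU(Stream_t) stream, float* p, size_t n) {
  if (n == 0) { return; }  // skip the launch entirely; a zero-block grid is an error
  const unsigned blocks = static_cast<unsigned>((n + 255) / 256);
  Noop<<<blocks, 256, 0, stream>>>(p, n);
}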
......
......@@ -32,7 +32,7 @@ enum class Algorithm {
};
template<Algorithm algorithm, typename T>
-void SoftmaxGpu(cudaStream_t cuda_stream, size_t rows, size_t cols, const T* x, T* y) {
+void SoftmaxGpu(GPU(Stream_t) cuda_stream, size_t rows, size_t cols, const T* x, T* y) {
using ComputeType = typename cuda::softmax::DefaultComputeType<T>::type;
oneflow::cuda::softmax::DirectLoad<T, ComputeType> load(x, cols);
oneflow::cuda::softmax::DirectStore<ComputeType, T> store(y, cols);
......@@ -55,7 +55,7 @@ class SoftmaxImpl : public SoftmaxBase {
~SoftmaxImpl() override = default;
void Launch(Stream* stream, size_t rows, size_t cols, const void* x, void* y) override {
-cudaStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
+GPU(Stream_t) cuda_stream = stream->As<CudaStream>()->cuda_stream();
SoftmaxGpu<algorithm, T>(cuda_stream, rows, cols, reinterpret_cast<const T*>(x),
reinterpret_cast<T*>(y));
}
......
......@@ -32,7 +32,7 @@ enum class Algorithm {
};
template<Algorithm algorithm, typename T>
-void SoftmaxBackwardGpu(cudaStream_t cuda_stream, size_t rows, size_t cols, const T* y, const T* dy,
+void SoftmaxBackwardGpu(GPU(Stream_t) cuda_stream, size_t rows, size_t cols, const T* y, const T* dy,
T* dx) {
using ComputeType = typename cuda::softmax::DefaultComputeType<T>::type;
cuda::softmax::DirectLoad<T, ComputeType> load_y(y, cols);
......@@ -60,7 +60,7 @@ class SoftmaxBackwardImpl : public SoftmaxBackwardBase {
void Launch(Stream* stream, size_t rows, size_t cols, const void* y, const void* dy,
void* dx) override {
-cudaStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
+GPU(Stream_t) cuda_stream = stream->As<CudaStream>()->cuda_stream();
SoftmaxBackwardGpu<algorithm, T>(cuda_stream, rows, cols, reinterpret_cast<const T*>(y),
reinterpret_cast<const T*>(dy), reinterpret_cast<T*>(dx));
}
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/include/primitive/tensor_fill.h"
#include "oneflow/core/ep/cuda/primitive/type_seq.h"
#include "oneflow/core/ep/cuda/cuda_stream.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
template<size_t size>
using Storage = typename std::aligned_storage<size, size>::type;
template<typename T, size_t pack>
union Pack {
static constexpr size_t size = sizeof(T) * pack;
explicit __device__ __host__ Pack(const T value) {
static_assert(sizeof(Pack) == size, "");
static_assert(alignof(Pack) == size, "");
#pragma unroll
for (size_t i = 0; i < pack; ++i) { elem[i] = value; }
}
T elem[pack];
Storage<size> storage;
};
template<typename T, size_t pack>
__global__ void TensorFillGpu(T* dst, const T* value, size_t count) {
const size_t pack_count = count / pack;
const T fill_value = value[0];
Pack<T, pack> pack_value(fill_value);
auto* pack_dst = reinterpret_cast<decltype(pack_value.storage)*>(dst);
CUDA_1D_KERNEL_LOOP_T(size_t, i, pack_count) { pack_dst[i] = pack_value.storage; }
T* tail_dst = dst + pack_count * pack;
const size_t tail_count = count - pack_count * pack;
CUDA_1D_KERNEL_LOOP_T(size_t, i, tail_count) { tail_dst[i] = fill_value; }
}
template<typename T, size_t pack>
typename std::enable_if<(pack != 0), void>::type LaunchPackTensorFill(GPU(Stream_t) stream, T* dst,
const T* value,
size_t count) {
TensorFillGpu<T, pack>
<<<BlocksNum4ThreadsNum(count), kCudaThreadsNumPerBlock, 0, stream>>>(dst, value, count);
}
template<typename T, size_t pack>
typename std::enable_if<(pack == 0), void>::type LaunchPackTensorFill(GPU(Stream_t) stream, T* dst,
const T* value,
size_t count) {
LOG(FATAL) << "wrong alignment";
}
template<typename T>
void LaunchTensorFill(GPU(Stream_t) stream, T* dst, const T* value, size_t count) {
auto uintptr = reinterpret_cast<std::uintptr_t>(dst);
if (uintptr % 16 == 0) {
LaunchPackTensorFill<T, 16 / sizeof(T)>(stream, dst, value, count);
} else if (uintptr % 8 == 0) {
LaunchPackTensorFill<T, 8 / sizeof(T)>(stream, dst, value, count);
} else if (uintptr % 4 == 0) {
LaunchPackTensorFill<T, 4 / sizeof(T)>(stream, dst, value, count);
} else if (uintptr % 2 == 0) {
LaunchPackTensorFill<T, 2 / sizeof(T)>(stream, dst, value, count);
} else {
LaunchPackTensorFill<T, 1 / sizeof(T)>(stream, dst, value, count);
}
}
template<typename T>
class TensorFillImpl : public TensorFill {
public:
OF_DISALLOW_COPY_AND_MOVE(TensorFillImpl);
TensorFillImpl() = default;
~TensorFillImpl() override = default;
void Launch(Stream* stream, const void* src, void* dst, size_t count) override {
GPU(Stream_t) cuda_stream = stream->As<CudaStream>()->cuda_stream();
const T* value = reinterpret_cast<const T*>(src);
LaunchTensorFill<T>(cuda_stream, reinterpret_cast<T*>(dst), value, count);
}
};
template<typename T>
std::unique_ptr<TensorFill> NewTensorFill() {
return std::unique_ptr<TensorFill>(new TensorFillImpl<T>());
}
class TensorFillFactoryImpl : public TensorFillFactory {
public:
OF_DISALLOW_COPY_AND_MOVE(TensorFillFactoryImpl);
TensorFillFactoryImpl() = default;
~TensorFillFactoryImpl() override = default;
std::unique_ptr<TensorFill> New(DataType data_type) override {
#define MAKE_NEW_TENSOR_FILL_ENTRY(type_cpp, type_proto) {type_proto, NewTensorFill<type_cpp>},
static const std::map<DataType, std::function<std::unique_ptr<TensorFill>()>> new_fill_handle{
OF_PP_FOR_EACH_TUPLE(MAKE_NEW_TENSOR_FILL_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)};
#undef MAKE_NEW_TENSOR_FILL_ENTRY
const auto it = new_fill_handle.find(data_type);
if (it != new_fill_handle.end()) {
return it->second();
} else {
return nullptr;
}
}
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, TensorFillFactory, TensorFillFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
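The alignment ladder in LaunchTensorFill picks the widest store the destination pointer permits: a 16-byte-aligned float* gets Pack<float, 4>, so each loop iteration writes one 16-byte Storage value. A compile-time restatement of that dispatch rule (standalone sketch, not OneFlow API):

#include <cstddef>
#include <cstdint>

// Mirrors the 16/8/4/2-byte ladder above; returns elements per vector store.
// For a pointer with no usable alignment, 1 / sizeof(T) is 0 for any T wider
// than one byte, which selects the pack == 0 overload that LOG(FATAL)s.
template<typename T>
constexpr std::size_t PackSize(std::uintptr_t addr) {
  return addr % 16 == 0 ? 16 / sizeof(T)
         : addr % 8 == 0 ? 8 / sizeof(T)
         : addr % 4 == 0 ? 4 / sizeof(T)
         : addr % 2 == 0 ? 2 / sizeof(T)
                         : 1 / sizeof(T);
}
static_assert(PackSize<float>(64) == 4, "16B-aligned float* -> 4-wide fill");
static_assert(PackSize<double>(8) == 1, "8B-aligned double* -> scalar fill");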
......@@ -63,6 +63,12 @@ limitations under the License.
CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \
CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#define CUDA_PRIMITIVE_INT_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT32_TYPE_SEQ \
CUDA_PRIMITIVE_INT64_TYPE_SEQ
#define UTIL_OPS_DATA_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
......@@ -75,4 +81,66 @@ limitations under the License.
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
// #if CUDA_VERSION >= 11000
// #include <cuda_bf16.h>
// #endif // CUDA_VERSION >= 11000
#define CUDA_PRIMITIVE_BOOL_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(bool, DataType::kBool)
#define CUDA_PRIMITIVE_CHAR_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(char, DataType::kChar)
#define CUDA_PRIMITIVE_INT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8)
#define CUDA_PRIMITIVE_UINT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8)
#define CUDA_PRIMITIVE_INT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32)
#define CUDA_PRIMITIVE_UINT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32)
#define CUDA_PRIMITIVE_INT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64)
#define CUDA_PRIMITIVE_UINT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64)
#define CUDA_PRIMITIVE_FLOAT_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat)
#define CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(double, DataType::kDouble)
#define CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(half, DataType::kFloat16)
// #if CUDA_VERSION >= 11000
// #define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(nv_bfloat16, DataType::kBFloat16)
// #else
#define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
// #endif // CUDA_VERSION >= 11000
#define CUDA_PRIMITIVE_ALL_TYPE_SEQ \
CUDA_PRIMITIVE_BOOL_TYPE_SEQ \
CUDA_PRIMITIVE_CHAR_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT32_TYPE_SEQ \
CUDA_PRIMITIVE_INT64_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \
CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \
CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#define CUDA_PRIMITIVE_FLOATING_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \
CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \
CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#define CUDA_PRIMITIVE_INT_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT32_TYPE_SEQ \
CUDA_PRIMITIVE_INT64_TYPE_SEQ
#define UTIL_OPS_DATA_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT32_TYPE_SEQ \
CUDA_PRIMITIVE_INT64_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \
CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \
CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#endif // WITH_ROCM
#endif // ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_
......@@ -17,14 +17,19 @@ limitations under the License.
#include "oneflow/core/ep/cuda/primitive/type_seq.h"
#include "oneflow/core/cuda/elementwise.cuh"
#include "oneflow/core/ep/cuda/cuda_stream.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#else
#include <cuda.h>
#endif
namespace oneflow {
namespace ep {
namespace primitive {
template<typename Dst, typename Src>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kGelu, Dst, Src> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC Dst operator()(Src src) const {
return static_cast<Src>(0.5) * src
......@@ -32,78 +37,236 @@ struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kGelu, Dst, Src> {
}
};
template<typename Dst, typename Src>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kFastGelu, Dst, Src> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC Dst operator()(Src src) const {
// ref to: https://mlfromscratch.com/activation-functions-explained/#gelu
const Src half = static_cast<Src>(0.5);
const Src one = static_cast<Src>(1);
const Src tanh_in = alpha * (src + beta * src * src * src);
return half * src * (one + tanh(tanh_in));
}
private:
// constant ref to:
// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/test/testdata/transform/fusion/fast_gelu.py
static constexpr Src alpha = static_cast<Src>(0.7978845608028654);
static constexpr Src beta = static_cast<Src>(0.044714998453855515);
};
template<typename Dst, typename Src>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kQuickGelu, Dst, Src> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC Dst operator()(Src src) const {
const Src sigmoid = static_cast<Src>(1.0) / (static_cast<Src>(1.0) + exp(-src * alpha));
return static_cast<Dst>(src * sigmoid);
}
private:
static constexpr Src alpha = static_cast<Src>(1.702);
};
namespace unary_functor_internal {
namespace {
OF_DEVICE_FUNC
float TanhApprox(float x) {
#if (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
float r;
asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(r) : "f"(x));
return r;
#else
return tanhf(x);
#endif // (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
}
} // namespace
} // namespace unary_functor_internal
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kFastGelu, half, half> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {}
OF_DEVICE_FUNC half operator()(half src) const {
#if (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
const float tanh_in =
__half2float(__float2half_rn(alpha) * (src + __float2half_rn(beta) * src * src * src));
const float tanh_out = unary_functor_internal::TanhApprox(tanh_in);
return __float2half_rn(0.5F) * src * (__float2half_rn(1.0F) + __float2half_rn(tanh_out));
#else
return static_cast<half>(float_functor(static_cast<float>(src)));
#endif // (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
}
#if (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
__device__ void Apply2(half* dst, const half* src) const {
const half2 src2 = *(reinterpret_cast<const half2*>(src));
const float2 tanh_in = __half22float2(__hmul2(
__float2half2_rn(alpha),
__hadd2(src2, __hmul2(__hmul2(__hmul2(__float2half2_rn(beta), src2), src2), src2))));
float2 tanh_out;
tanh_out.x = unary_functor_internal::TanhApprox(tanh_in.x);
tanh_out.y = unary_functor_internal::TanhApprox(tanh_in.y);
const half2 dst2 = __hmul2(__hmul2(__float2half2_rn(0.5F), src2),
__hadd2(__float2half2_rn(1.0F), __float22half2_rn(tanh_out)));
*reinterpret_cast<half2*>(dst) = dst2;
}
#endif // (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
private:
static constexpr float alpha = 0.7978845608028654F;
static constexpr float beta = 0.044714998453855515F;
UnaryFunctor<DeviceType::kCUDA, UnaryOp::kFastGelu, float, float> float_functor;
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTanh, float, float> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC float operator()(float src) const { return tanhf(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTanh, double, double> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC double operator()(double src) const { return tanh(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTanh, half, half> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC half operator()(half src) const { return __float2half(tanhf(__half2float(src))); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, half> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(half src) const { return isinf(__half2float(src)); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, float> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(float src) const { return isinf(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, double> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(double src) const { return isinf(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, half> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(half src) const { return isnan(__half2float(src)); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, float> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(float src) const { return isnan(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, double> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(double src) const { return isnan(src); }
};
-#define SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(op) \
-template<> \
-struct UnaryFunctor<DeviceType::kCUDA, op, half, half> { \
-UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
-\
-UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
-OF_DEVICE_FUNC half operator()(half src) const { \
-return __float2half(float_functor(__half2float(src))); \
-} \
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsFinite, bool, half> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(half src) const { return isfinite(__half2float(src)); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsFinite, bool, float> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(float src) const { return isfinite(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsFinite, bool, double> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(double src) const { return isfinite(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTrunc, half, half> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
__device__ half operator()(half src) const { return htrunc(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTrunc, float, float> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC float operator()(float src) const { return truncf(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTrunc, double, double> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC double operator()(double src) const { return trunc(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kAbs, half, half> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
__device__ half operator()(half src) const {
return __hlt(src, static_cast<half>(0)) ? __hneg(src) : src;
}
};
template<typename Dst, typename Src>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kNanAssign, Dst, Src> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC Dst operator()(Src src) const { return isnan(src) ? static_cast<Dst>(0.0) : src; }
};
#if CUDA_VERSION >= 11000
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kAbs, nv_bfloat16, nv_bfloat16> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
__device__ nv_bfloat16 operator()(nv_bfloat16 src) const {
#if __CUDA_ARCH__ >= 800
return __habs(src);
#else
return __float2bfloat16(abs(__bfloat162float(src)));
#endif // __CUDA_ARCH__ >= 800
}
};
#endif // CUDA_VERSION >= 11000
+#define SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(op) \
+template<> \
+struct UnaryFunctor<DeviceType::kCUDA, op, half, half> { \
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
+\
+UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
+OF_DEVICE_FUNC half operator()(half src) const { \
+return __float2half(float_functor(__half2float(src))); \
+} \
};
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kElu);
......@@ -114,20 +277,53 @@ SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSelu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSilu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftSign);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftPlus);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kAcos);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kAcosh);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kAsin);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kAsinh);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kAtan);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kAtanh);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kCeil);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kCos);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kCosh);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kErf);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kErfc);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kExp);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kExpm1);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kFloor);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kLgamma);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kLog);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kLog2);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kLog10);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kLog1p);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kLogSigmoid);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kRint);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kRound);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kRsqrt);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSigmoid);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSin);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSinh);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSqrt);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSquare);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kTan);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kReciprocalNoNan);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kNotEqualZero);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kNanAssign);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kQuickGelu);
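Each SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR line above expands to a full specialization that round-trips through the float functor; for example, the kSin entry expands to:

template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kSin, half, half> {
  OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {}

  UnaryFunctor<DeviceType::kCUDA, UnaryOp::kSin, float, float> float_functor;
  OF_DEVICE_FUNC half operator()(half src) const {
    return __float2half(float_functor(__half2float(src)));  // compute in float, store as half
  }
};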
/*********nv_bfloat16_kernel*******/
#if CUDA_VERSION >= 11000
-#define SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(op) \
-template<> \
-struct UnaryFunctor<DeviceType::kCUDA, op, nv_bfloat16, nv_bfloat16> { \
-UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
-\
-UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
-OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src) const { \
-return __float2bfloat16(float_functor(__bfloat162float(src))); \
-} \
+#define SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(op) \
+template<> \
+struct UnaryFunctor<DeviceType::kCUDA, op, nv_bfloat16, nv_bfloat16> { \
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
+\
+UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
+OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src) const { \
+return __float2bfloat16(float_functor(__bfloat162float(src))); \
+} \
};
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kElu);
......@@ -146,6 +342,40 @@ SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftSign);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftPlus);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kTanh);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kThreshold);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kAcos);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kAcosh);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kAsin);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kAsinh);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kAtan);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kAtanh);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kCeil);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kCos);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kCosh);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kErf);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kErfc);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kExp);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kExpm1);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kFloor);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLgamma);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLog);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLog2);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLog10);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLog1p);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLogSigmoid);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kRint);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kRound);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kRsqrt);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSigmoid);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSin);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSinh);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSqrt);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSquare);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kTan);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kReciprocalNoNan);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kNotEqualZero);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kNanAssign);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kFastGelu);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kQuickGelu);
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, nv_bfloat16> {
......@@ -160,8 +390,26 @@ struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, nv_bfloat16> {
OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isnan(__bfloat162float(src)); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsFinite, bool, nv_bfloat16> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
-#endif
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isfinite(__bfloat162float(src)); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTrunc, nv_bfloat16, nv_bfloat16> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
__device__ nv_bfloat16 operator()(nv_bfloat16 src) const {
#if __CUDA_ARCH__ >= 800
return htrunc(src);
#else
return __float2bfloat16(truncf(__bfloat162float(src)));
#endif // __CUDA_ARCH__ >= 800
}
};
#endif // CUDA_VERSION >= 11000
} // namespace primitive
} // namespace ep
......
......@@ -21,6 +21,7 @@ limitations under the License.
#include "oneflow/core/ep/include/event.h"
#include "oneflow/core/ep/include/stream.h"
#include "oneflow/core/ep/include/allocation_options.h"
#include "oneflow/core/ep/include/gpu_macro.h"
namespace oneflow {
......
......@@ -18,6 +18,7 @@ limitations under the License.
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/maybe.h"
#include "oneflow/core/ep/include/gpu_macro.h"
namespace oneflow {
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_EP_GPU_MACRO_H_
#define ONEFLOW_CORE_EP_GPU_MACRO_H_
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#define GPU(str) hip##str
#define GPURAND(str) hiprand##str
#define GPUMultiProcessorCount hipDeviceAttributeMultiprocessorCount
#define GPUMaxThreadsPerMultiProcessor hipDeviceAttributeMaxThreadsPerMultiProcessor
#define GPUMaxSharedMemoryPerBlockOptin hipDeviceAttributeSharedMemPerBlockOptin
__device__ __forceinline__ void TRAP()
{
asm volatile("s_trap 0;");
}
#else
#include <cuda.h>
#define GPU(str) cuda##str
#define GPURAND(str) curand##str
#define GPUMultiProcessorCount cudaDevAttrMultiProcessorCount
#define GPUMaxThreadsPerMultiProcessor cudaDevAttrMaxThreadsPerMultiProcessor
#define GPUMaxSharedMemoryPerBlockOptin cudaDevAttrMaxSharedMemoryPerBlockOptin
__device__ __forceinline__ void TRAP()
{
__trap();
}
#endif
#endif // ONEFLOW_CORE_EP_GPU_MACRO_H_
\ No newline at end of file
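These macros let shared source name the runtime API once: under ROCm, GPU(Stream_t) expands to hipStream_t and GPU(MemsetAsync) to hipMemsetAsync; under CUDA, to cudaStream_t and cudaMemsetAsync. A small call-site sketch (hypothetical helper, assuming the runtime headers are in scope; note TRAP() is device-side only):

#include "oneflow/core/ep/include/gpu_macro.h"

// Compiles as hipMemsetAsync(...) on ROCm and cudaMemsetAsync(...) on CUDA.
inline GPU(Error_t) ZeroAsync(GPU(Stream_t) stream, void* ptr, size_t bytes) {
  return GPU(MemsetAsync)(ptr, 0, bytes, stream);
}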
......@@ -32,6 +32,12 @@ enum class BinaryOp {
kMax,
kMin,
kPow,
kFmod,
kFloorDiv,
kTruncDiv,
kFloorMod,
kScalarBasePowerGrad,
kScalarExpPowerGrad,
// Comparison
kEqual,
kNotEqual,
......@@ -39,6 +45,8 @@ enum class BinaryOp {
kLessEqual,
kGreaterThan,
kGreaterEqual,
kIsClose,
kIsCloseEqualNan,
// Logical
kLogicalAnd,
kLogicalOr,
......@@ -62,7 +70,35 @@ enum class BinaryOp {
kTanhBackwardWithDyX,
kThresholdBackwardWithDyX,
kSigmoidBackwardWithDyY,
kAbsBackwardWithDyX,
kAcosBackwardWithDyX,
kAcoshBackwardWithDyX,
kAsinBackwardWithDyX,
kAsinhBackwardWithDyX,
kAtanBackwardWithDyX,
kAtanhBackwardWithDyX,
kCosBackwardWithDyX,
kCoshBackwardWithDyX,
kErfBackwardWithDyX,
kErfcBackwardWithDyX,
kExpBackwardWithDyX,
kExpm1BackwardWithDyX,
kLgammaBackwardWithDyX,
kLogBackwardWithDyX,
kLog2BackwardWithDyX,
kLog10BackwardWithDyX,
kLog1pBackwardWithDyX,
kLogSigmoidBackwardWithDyX,
kReciprocalBackwardWithDyX,
kReciprocalNoNanBackwardWithDyX,
kRsqrtBackwardWithDyX,
kSinBackwardWithDyX,
kSinhBackwardWithDyX,
kSqrtBackwardWithDyX,
kSquareBackwardWithDyX,
kTanBackwardWithDyX,
kFastGeluBackwardWithDyX,
kQuickGeluBackwardWithDyX,
};
}
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_EP_PRIMITIVE_BROADCAST_ELEMENTWISE_UNARY_H_
#define ONEFLOW_CORE_EP_PRIMITIVE_BROADCAST_ELEMENTWISE_UNARY_H_
#include "oneflow/core/ep/include/primitive/primitive.h"
#include "oneflow/core/ep/include/primitive/unary_op.h"
#include "oneflow/core/common/scalar.h"
namespace oneflow {
namespace ep {
namespace primitive {
class BroadcastElementwiseUnary : public Primitive {
public:
OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseUnary);
BroadcastElementwiseUnary() = default;
~BroadcastElementwiseUnary() override = default;
virtual void Launch(Stream* stream, size_t num_src_dims, const int64_t* src_dims,
const int64_t* src_strides, const void* src, size_t num_dst_dims,
const int64_t* dst_dims, const int64_t* dst_strides, void* dst) = 0;
virtual void Launch(Stream* stream, size_t num_src_dims, const int64_t* src_dims, const void* src,
size_t num_dst_dims, const int64_t* dst_dims, void* dst) = 0;
};
class BroadcastElementwiseUnaryFactory : public Factory<BroadcastElementwiseUnary> {
public:
OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseUnaryFactory);
BroadcastElementwiseUnaryFactory() = default;
~BroadcastElementwiseUnaryFactory() override = default;
virtual std::unique_ptr<BroadcastElementwiseUnary> New(UnaryOp op, DataType src_type,
DataType dst_type,
size_t max_num_dims) = 0;
virtual std::unique_ptr<BroadcastElementwiseUnary> New(UnaryOp op, DataType src_type,
DataType dst_type, size_t max_num_dims,
Scalar attr0) = 0;
virtual std::unique_ptr<BroadcastElementwiseUnary> New(UnaryOp op, DataType src_type,
DataType dst_type, size_t max_num_dims,
Scalar attr0, Scalar attr1) = 0;
};
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif // ONEFLOW_CORE_EP_PRIMITIVE_BROADCAST_ELEMENTWISE_UNARY_H_
......@@ -16,8 +16,10 @@ limitations under the License.
#ifndef ONEFLOW_CORE_EP_PRIMITIVE_FAST_INTEGER_MATH_H_
#define ONEFLOW_CORE_EP_PRIMITIVE_FAST_INTEGER_MATH_H_
#include "oneflow/core/common/data_type.h"
#ifdef WITH_ROCM
#include "hip/device_functions.h" // /opt/rocm/hip/include/hip
#endif
#include <cassert>
#include "device_functions.h" // /opt/rocm/hip/include/hip
namespace oneflow {
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_EP_PRIMITIVE_TENSOR_FILL_H_
#define ONEFLOW_CORE_EP_PRIMITIVE_TENSOR_FILL_H_
#include "oneflow/core/ep/include/primitive/primitive.h"
namespace oneflow {
namespace ep {
namespace primitive {
class TensorFill : public Primitive {
public:
OF_DISALLOW_COPY_AND_MOVE(TensorFill);
TensorFill() = default;
~TensorFill() override = default;
virtual void Launch(Stream* stream, const void* src, void* dst, size_t count) = 0;
};
class TensorFillFactory : public Factory<TensorFill> {
public:
OF_DISALLOW_COPY_AND_MOVE(TensorFillFactory);
TensorFillFactory() = default;
~TensorFillFactory() override = default;
virtual std::unique_ptr<TensorFill> New(DataType data_type) = 0;
};
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif // ONEFLOW_CORE_EP_PRIMITIVE_TENSOR_FILL_H_
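As with the other primitives, a caller resolves this interface through the registry. A hedged sketch, again assuming the NewPrimitive helper plus pre-existing device pointers; per the TensorFillGpu kernel earlier in this commit, src must point to a single device-resident element, since the kernel reads value[0] on the device:

// Fill `count` floats at `dst_ptr` with the value stored at `value_ptr`.
auto fill = oneflow::ep::primitive::NewPrimitive<oneflow::ep::primitive::TensorFillFactory>(
    oneflow::DeviceType::kCUDA, oneflow::DataType::kFloat);
if (fill) { fill->Launch(stream, /*src=*/value_ptr, /*dst=*/dst_ptr, count); }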
......@@ -22,6 +22,7 @@ namespace ep {
namespace primitive {
enum class UnaryOp {
kIdentity,
// activation op
kElu,
kCelu,
......@@ -40,13 +41,53 @@ enum class UnaryOp {
kSoftPlus,
kTanh,
kThreshold,
kFastGelu,
kQuickGelu,
// math op
kAbs,
kAcos,
kAcosh,
kAsin,
kAsinh,
kAtan,
kAtanh,
kCeil,
kCos,
kCosh,
kErf,
kErfc,
kExp,
kExpm1,
kFloor,
kLgamma,
kLog,
kLog2,
kLog10,
kLog1p,
kLogSigmoid,
kNegative,
kReciprocal,
kReciprocalNoNan,
kRint,
kRound,
kRsqrt,
kSigmoid,
kSign,
kSin,
kSinh,
kSqrt,
kSquare,
kTan,
kTrunc,
kNotEqualZero,
// logical op
kLogicalNot,
// utils op
kIsInf,
kIsNan,
kIsFinite,
kNanAssign,
};
}
......
......@@ -20,6 +20,7 @@ limitations under the License.
#include "oneflow/core/common/device_type.h"
#include "oneflow/core/common/maybe.h"
#include "oneflow/core/ep/include/event.h"
#include "oneflow/core/ep/include/gpu_macro.h"
namespace oneflow {
......@@ -37,6 +38,7 @@ class Stream {
virtual Device* device() const = 0;
virtual Maybe<void> Sync() = 0;
virtual void RecordEvent(Event* event) = 0;
virtual Maybe<void> GetAsyncError() { return Maybe<void>::Ok(); }
virtual Maybe<void> OnExecutionContextSetup() { return Maybe<void>::Ok(); }
virtual Maybe<void> OnExecutionContextTeardown() { return Maybe<void>::Ok(); }
......
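The new GetAsyncError() hook lets a device stream surface asynchronous failures without forcing a sync; the default implementation returns OK. A hypothetical override in a GPU stream subclass (sketch only; GPU(GetLastError) maps to hipGetLastError/cudaGetLastError, which also clears the sticky error state):

Maybe<void> GetAsyncError() override {
  // Poll the sticky error state instead of synchronizing the stream.
  GPU(Error_t) err = GPU(GetLastError)();
  if (err != GPU(Success)) { return Error::RuntimeError() << GPU(GetErrorString)(err); }
  return Maybe<void>::Ok();
}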
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/cuda_device.h"
#include "oneflow/core/ep/rocm/cuda_event.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
// #if CUDA_VERSION >= 11000
// #include <cuda_bf16.h>
// #endif
namespace oneflow {
namespace ep {
namespace {
constexpr size_t kDefaultConstBufElementCount = 1024 * 1024;
template<typename T>
void CreateConstBuffer(void** buf, T value, size_t n) {
OF_CUDA_CHECK(hipMalloc(buf, n * sizeof(T)));
std::vector<T> host(n, value);
OF_CUDA_CHECK(hipMemcpy(*buf, host.data(), n * sizeof(T), hipMemcpyDefault));
}
} // namespace
CudaDevice::CudaDevice(int device_index, DeviceManager* device_manager)
: device_index_(device_index),
event_flags_{},
properties_{},
device_manager_(device_manager),
const_buf_elem_cnt_(0),
const_zeros_buffer_(nullptr),
const_ones_buffer_fp32_(nullptr),
const_ones_buffer_fp16_(nullptr),
const_ones_buffer_bf16_(nullptr) {
CudaCurrentDeviceGuard guard(device_index_);
OF_CUDA_CHECK(hipGetDeviceProperties(&properties_, device_index_));
event_flags_ = hipEventDisableTiming;
if (ParseBooleanFromEnv("ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC", false)) {
event_flags_ |= hipEventBlockingSync;
}
const_buf_elem_cnt_ = ParseIntegerFromEnv("ONEFLOW_EP_CUDA_CONST_BUFFER_ELEMENT_COUNT",
kDefaultConstBufElementCount);
if (const_buf_elem_cnt_ > 0) {
CreateConstBuffer<float>(&const_zeros_buffer_, static_cast<float>(0), const_buf_elem_cnt_);
CreateConstBuffer<float>(&const_ones_buffer_fp32_, static_cast<float>(1.0),
const_buf_elem_cnt_);
CreateConstBuffer<half>(&const_ones_buffer_fp16_, static_cast<half>(1.0), const_buf_elem_cnt_);
// #if CUDA_VERSION >= 11000
// CreateConstBuffer<nv_bfloat16>(&const_ones_buffer_bf16_, static_cast<nv_bfloat16>(1.0),
// const_buf_elem_cnt_);
// #endif
}
}
CudaDevice::~CudaDevice() {
CudaCurrentDeviceGuard guard(device_index_);
for (auto* event : events_) { delete event; }
OF_CUDA_CHECK(hipFree(const_zeros_buffer_));
OF_CUDA_CHECK(hipFree(const_ones_buffer_fp32_));
OF_CUDA_CHECK(hipFree(const_ones_buffer_fp16_));
OF_CUDA_CHECK(hipFree(const_ones_buffer_bf16_));
}
void CudaDevice::SetAsActiveDevice() { OF_CUDA_CHECK(hipSetDevice(device_index_)); }
Stream* CudaDevice::CreateStream() {
CudaCurrentDeviceGuard guard(device_index_);
return new CudaStream(this);
}
void CudaDevice::DestroyStream(Stream* stream) {
CudaCurrentDeviceGuard guard(device_index_);
delete stream;
}
void CudaDevice::CreateEvents(Event** events, size_t count) {
size_t copied = 0;
{
std::lock_guard<std::mutex> lock(events_mutex_);
copied = std::min(count, events_.size());
size_t offset = events_.size() - copied;
std::copy(events_.begin() + offset, events_.end(), events);
events_.resize(offset);
}
if (copied != count) {
CudaCurrentDeviceGuard guard(device_index_);
for (size_t i = copied; i < count; ++i) { events[i] = new CudaEvent(event_flags_); }
}
}
void CudaDevice::DestroyEvents(Event** events, size_t count) {
std::lock_guard<std::mutex> lock(events_mutex_);
events_.insert(events_.end(), events, events + count);
}
Maybe<void> CudaDevice::Alloc(const AllocationOptions& options, void** ptr, size_t size) {
CudaCurrentDeviceGuard guard(device_index_);
CHECK(!options.HasPinnedDevice());
hipError_t err = hipMalloc(ptr, size);
if (err != hipSuccess) {
return Error::RuntimeError() << hipGetErrorString(err);
} else {
return Maybe<void>::Ok();
}
}
void CudaDevice::Free(const AllocationOptions& attr, void* ptr) {
CudaCurrentDeviceGuard guard(device_index_);
OF_CUDA_CHECK(hipFree(ptr));
}
Maybe<void> CudaDevice::AllocPinned(const AllocationOptions& options, void** ptr, size_t size) {
CudaCurrentDeviceGuard guard(device_index_);
hipError_t err = NumaAwareCudaMallocHost(device_index_, ptr, size);
if (err != hipSuccess) {
return Error::RuntimeError() << hipGetErrorString(err);
} else {
return Maybe<void>::Ok();
}
}
void CudaDevice::FreePinned(const AllocationOptions& options, void* ptr) {
CudaCurrentDeviceGuard guard(device_index_);
OF_CUDA_CHECK(hipHostFree(ptr));
}
const hipDeviceProp_t& CudaDevice::properties() const { return properties_; }
const void* CudaDevice::GetConstZeros(DataType data_type, size_t n) const {
if (GetSizeOfDataType(data_type) * n
<= GetSizeOfDataType(DataType::kFloat) * const_buf_elem_cnt_) {
return const_zeros_buffer_;
} else {
return nullptr;
}
}
const void* CudaDevice::GetConstOnes(DataType data_type, size_t n) const {
if (n <= const_buf_elem_cnt_) {
if (data_type == DataType::kFloat) {
return const_ones_buffer_fp32_;
} else if (data_type == DataType::kFloat16) {
return const_ones_buffer_fp16_;
} else if (data_type == DataType::kBFloat16) {
return const_ones_buffer_bf16_;
} else {
return nullptr;
}
} else {
return nullptr;
}
}
} // namespace ep
} // namespace oneflow
#endif // WITH_ROCM
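A minimal lifecycle sketch for this device class, assuming a CudaDeviceManager* named manager obtained from the registry (names outside this diff):

auto device = manager->GetDevice(/*device_index=*/0);
oneflow::ep::Stream* stream = device->CreateStream();
void* buf = nullptr;
oneflow::AllocationOptions options;
CHECK_JUST(device->Alloc(options, &buf, /*size=*/1024));
// ... enqueue work on `stream`, e.g. via the primitives above ...
device->Free(options, buf);
device->DestroyStream(stream);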
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
#define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
#include "oneflow/core/ep/include/device.h"
#include "oneflow/core/common/data_type.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
class CudaDevice : public Device {
public:
OF_DISALLOW_COPY_AND_MOVE(CudaDevice);
explicit CudaDevice(int device_index, DeviceManager* device_manager);
~CudaDevice() override;
void SetAsActiveDevice() override;
DeviceType device_type() const override { return DeviceType::kCUDA; }
size_t device_index() const override { return device_index_; }
DeviceManager* device_manager() const override { return device_manager_; }
Stream* CreateStream() override;
void DestroyStream(Stream* stream) override;
void CreateEvents(Event** events, size_t count) override;
void DestroyEvents(Event** events, size_t count) override;
Maybe<void> Alloc(const AllocationOptions& options, void** ptr, size_t size) override;
void Free(const AllocationOptions& options, void* ptr) override;
Maybe<void> AllocPinned(const AllocationOptions& options, void** ptr, size_t size) override;
void FreePinned(const AllocationOptions& options, void* ptr) override;
const hipDeviceProp_t& properties() const;
const void* GetConstZeros(DataType data_type, size_t n) const;
const void* GetConstOnes(DataType data_type, size_t n) const;
private:
int device_index_;
std::mutex events_mutex_;
std::vector<Event*> events_;
unsigned int event_flags_;
hipDeviceProp_t properties_;
DeviceManager* device_manager_;
int64_t const_buf_elem_cnt_;
void* const_zeros_buffer_;
void* const_ones_buffer_fp32_;
void* const_ones_buffer_fp16_;
void* const_ones_buffer_bf16_;
};
} // namespace ep
} // namespace oneflow
#endif // WITH_ROCM
#endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/cuda_device_manager.h"
#include "oneflow/core/device/cuda_util.h"
#ifdef WITH_ROCM
namespace oneflow {
namespace ep {
CudaDeviceManager::CudaDeviceManager(DeviceManagerRegistry* registry) : registry_(registry) {}
CudaDeviceManager::~CudaDeviceManager() = default;
DeviceManagerRegistry* CudaDeviceManager::registry() const { return registry_; }
std::shared_ptr<Device> CudaDeviceManager::GetDevice(size_t device_index) {
std::lock_guard<std::mutex> lock(devices_mutex_);
if (device_index < devices_.size() && devices_.at(device_index)) {
return devices_.at(device_index);
}
auto device = std::make_shared<CudaDevice>(device_index, this);
if (device_index >= devices_.size()) { devices_.resize(device_index + 1); }
devices_.at(device_index) = device;
return device;
}
size_t CudaDeviceManager::GetDeviceCount(size_t primary_device_index) {
CudaCurrentDeviceGuard guard(primary_device_index);
return this->GetDeviceCount();
}
size_t CudaDeviceManager::GetDeviceCount() {
int count = 0;
hipError_t err = hipGetDeviceCount(&count);
if (err == hipErrorNoDevice || err == hipErrorInsufficientDriver) { return 0; }
OF_CUDA_CHECK(err);
return count;
}
size_t CudaDeviceManager::GetActiveDeviceIndex() {
int device = 0;
OF_CUDA_CHECK(hipGetDevice(&device));
return static_cast<size_t>(device);
}
void CudaDeviceManager::SetActiveDeviceByIndex(size_t device_index) {
OF_CUDA_CHECK(hipSetDevice(static_cast<int>(device_index)));
}
} // namespace ep
} // namespace oneflow
#endif // WITH_ROCM