Commit 8f7de847 authored by yuguo960516yuguo

dtk

parent f262efc9
Pipeline #248 failed with stages in 0 seconds
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/ndarray/ndarray_assign_core.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/kernel/kernel_util.h"

namespace oneflow {

namespace {

template<typename T, typename X, int NDIMS>
__global__ void NdarrayAssignReducedGpu(XpuVarNdarray<T> y,
                                        const XpuReducedNdarray<X, NDIMS> reduced) {
  NdarrayAssignCore<T, X, NDIMS>::Assign(y, reduced);
}

template<typename T, typename X, int NDIMS>
__global__ void NdarrayAssignGpu(XpuVarNdarray<T> y, const XpuVarNdarray<const X> x) {
  NdarrayAssignCore<T, X, NDIMS>::Assign(y, x);
}

}  // namespace

template<typename T, typename X, int NDIMS>
struct NdarrayAssignCoreWrapper<DeviceType::kCUDA, T, X, NDIMS> final {
  static void Assign(ep::Stream* stream, XpuVarNdarray<T>* y,
                     const XpuReducedNdarray<X, NDIMS>& reduced) {
    size_t n = y->host_shape().HostElemNum();
    RUN_CUDA_KERNEL((NdarrayAssignReducedGpu<T, X, NDIMS>), stream, n, *y, reduced);
  }
  static void Assign(ep::Stream* ctx, const XpuVarNdarray<T>& y, const XpuVarNdarray<const X>& x) {
    size_t n = y.host_shape().HostElemNum();
    if (n == 0) { return; }
    RUN_CUDA_KERNEL((NdarrayAssignGpu<T, X, NDIMS>), ctx, n, y, x);
  }
};

#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS)                           \
  template struct NdarrayAssignCoreWrapper<DeviceType::kCUDA, OF_PP_PAIR_FIRST(ret_dtype_pair), \
                                           OF_PP_PAIR_FIRST(dtype_pair), NDIMS>;
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(
    INSTANTIATE_NDARRAY_ASSIGN,
    ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ,
    ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, DIM_SEQ);
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ, HALF_DATA_TYPE_SEQ,
                                 DIM_SEQ);

}  // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "fmt/core.h"
#include "fmt/format.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/util.h"

using json = nlohmann::json;

namespace oneflow {
namespace profiler {

nlohmann::json IEvent::ToJson() {
  return json{{"name", name_}, {"time", GetDuration<double>()}, {"input_shapes", "-"}};
}

void IEvent::SetStartedAt(double t) { started_at_ = t; }

void IEvent::SetFinishedAt(double t) { finished_at_ = t; }

void IEvent::Start() { SetStartedAt(GetTimeNow()); }

void IEvent::Finish() { SetFinishedAt(GetTimeNow()); }

bool IEvent::IsChildOf(const IEvent* e) {
  if (!e) { return false; }
  if (this == e) { return false; }
  return GetStartedAt<double>() >= e->GetStartedAt<double>()
         && GetFinishedAt<double>() <= e->GetFinishedAt<double>();
}

const std::string& IEvent::GetName() const { return name_; }

std::string CustomEvent::Key() { return name_; }

nlohmann::json CustomEvent::ToJson() {
  auto j = IEvent::ToJson();
  j["type"] = EventType::kCustom;
  j["custom_type"] = type_;
  return j;
}

std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, CustomEventType type) {
  return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
}

std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }

nlohmann::json KernelEvent::ToJson() {
  auto j = IEvent::ToJson();
  j["type"] = EventType::kOneflowKernel;
  j["input_shapes"] = GetFormatedInputShapes();
#if defined(WITH_CUDA) || defined(WITH_ROCM)
  j["memory_size"] = memory_size_;
  if (!children_.empty()) { j["children"] = children_; }
#endif  // WITH_CUDA || WITH_ROCM
  return j;
}

std::shared_ptr<KernelEvent> KernelEvent::Create(
    const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter) {
  return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
}

std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
  if (input_shapes_.size() == 0) { return "-"; }
  std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
  for (size_t i = 0; i < shapes_formated.size(); ++i) {
    const std::string current_shape = input_shapes_[i].ToString();
    shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
  }
  if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
  return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
}

}  // namespace profiler
}  // namespace oneflow
\ No newline at end of file
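Not part of the commit, for orientation only: a minimal sketch of how the event API implemented above can be driven end to end, assuming it is compiled inside the OneFlow tree. The helper name SketchProfilerEventUsage is hypothetical; CustomEvent::Create, Start(), Finish(), GetDuration() and ToJson() are the members shown in this diff.

// Illustrative sketch only, not part of the commit.
#include <iostream>
#include "oneflow/core/profiler/event.h"

void SketchProfilerEventUsage() {  // hypothetical helper name
  using namespace oneflow::profiler;
  // kDefault custom events record their timestamps in nanoseconds.
  auto event = CustomEvent::Create("my_region", CustomEventType::kDefault);
  event->Start();
  // ... the region being measured ...
  event->Finish();
  // GetDuration() defaults to microseconds regardless of the event's native unit.
  std::cout << event->GetName() << " took " << event->GetDuration<double>() << " us\n"
            << event->ToJson().dump() << std::endl;
}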
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_
#define ONEFLOW_CORE_PROFILER_EVENT_H_

#include <functional>
#include <memory>
#include <set>
#include <vector>
#include "nlohmann/json.hpp"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/shape_view.h"

namespace oneflow {
namespace profiler {

class ProfileManager;

enum class EventType {
  kCustom,        // has three kinds
  kOneflowKernel  // OneFlow cpu/cuda kernel
};

enum class CustomEventType {
  kDefault,     // for record_function
  kCudaKernel,  // cuda kernel
  kCudaRuntime  // something like cudaLaunchKernel
};

enum class EventTimeUnit { kNS, kUS };

class IEvent {
 public:
  OF_DISALLOW_COPY_AND_MOVE(IEvent);
  IEvent() = delete;
  IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {}
  virtual std::string Key() = 0;
  virtual nlohmann::json ToJson();
  virtual ~IEvent() = default;
  virtual void Start();
  virtual void Finish();

  bool IsChildOf(const IEvent* e);

  const std::string& GetName() const;

  template<typename T>
  const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const;

  template<typename T>
  const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;

  template<typename T>
  const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;

 protected:
  virtual void SetStartedAt(double t);
  virtual void SetFinishedAt(double t);

  std::string name_;
  EventTimeUnit time_unit_;
  double started_at_ = 0;
  double finished_at_ = 0;
};

inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) {
  if (src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS) {
    return time_ / 1000;
  }
  if (src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS) {
    return time_ * 1000;
  }
  return time_;
}

template<>
const inline double IEvent::GetStartedAt<double>(EventTimeUnit time_unit) const {
  return ConvertTime(started_at_, time_unit_, time_unit);
}

template<>
const inline time_t IEvent::GetStartedAt<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetStartedAt<double>(time_unit));
}

template<>
const inline double IEvent::GetFinishedAt<double>(EventTimeUnit time_unit) const {
  return ConvertTime(finished_at_, time_unit_, time_unit);
}

template<>
const inline time_t IEvent::GetFinishedAt<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetFinishedAt<double>(time_unit));
}

template<>
const inline double IEvent::GetDuration<double>(EventTimeUnit time_unit) const {
  return GetFinishedAt<double>(time_unit) - GetStartedAt<double>(time_unit);
}

template<>
const inline time_t IEvent::GetDuration<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetDuration<double>(time_unit));
}

class CustomEvent final : public IEvent {
 public:
  friend class ProfileManager;
  std::string Key() override;
  nlohmann::json ToJson() override;
  static std::shared_ptr<CustomEvent> Create(const std::string& name,
                                             CustomEventType type = CustomEventType::kDefault);

 private:
  CustomEventType type_;
  CustomEvent(const std::string& custom_name, CustomEventType type)
      : IEvent(custom_name,
               type == CustomEventType::kDefault ? EventTimeUnit::kNS : EventTimeUnit::kUS),
        type_(type) {}
};

class KernelEvent final : public IEvent {
 public:
  std::string Key() override;
  nlohmann::json ToJson() override;
  static std::shared_ptr<KernelEvent> Create(
      const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter);

#if defined(WITH_CUDA) || defined(WITH_ROCM)
  void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
  void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
  bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
    if (e->IsChildOf(dynamic_cast<IEvent*>(this))) {
      children_.emplace(e);
      return true;
    }
    return false;
  }
  bool HasChildEvent(const std::shared_ptr<IEvent>& e) { return children_.count(e); }
  void WalkAmongChildren(const std::function<void(const std::shared_ptr<IEvent>& e)>& f) const {
    for (const auto& x : children_) { f(x); }
  }
#endif  // WITH_CUDA || WITH_ROCM

 private:
  KernelEvent(const std::string& kernel_name,
              const std::function<std::vector<Shape>(void)>& shape_getter)
      : IEvent(kernel_name, EventTimeUnit::kNS) {
    if (shape_getter) { input_shapes_ = shape_getter(); }
  }

#if defined(WITH_CUDA) || defined(WITH_ROCM)
  int64_t memory_size_ = -1;
  std::set<std::shared_ptr<IEvent>> children_;
#endif  // WITH_CUDA || WITH_ROCM

  std::vector<Shape> input_shapes_;
  std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
};

}  // namespace profiler
}  // namespace oneflow

namespace nlohmann {
inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) {
  j = event->ToJson();
}
}  // namespace nlohmann

#endif  // ONEFLOW_CORE_PROFILER_EVENT_H_
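Not part of the commit: a small check of the unit conversion defined by ConvertTime above, which scales by 1000 between nanoseconds and microseconds and passes every other combination through unchanged. SketchConvertTime is a hypothetical helper name; the values are chosen to be exactly representable so the equality checks are sound.

// Illustrative sketch only, not part of the commit.
#include <cassert>
#include "oneflow/core/profiler/event.h"

void SketchConvertTime() {  // hypothetical helper name
  using oneflow::profiler::ConvertTime;
  using oneflow::profiler::EventTimeUnit;
  assert(ConvertTime(2500.0, EventTimeUnit::kNS, EventTimeUnit::kUS) == 2.5);  // ns -> us
  assert(ConvertTime(2.5, EventTimeUnit::kUS, EventTimeUnit::kNS) == 2500.0);  // us -> ns
  assert(ConvertTime(7.0, EventTimeUnit::kUS, EventTimeUnit::kUS) == 7.0);     // pass-through
}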
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_

#include "oneflow/core/common/util.h"
#include "oneflow/core/profiler/event.h"

namespace oneflow {
namespace profiler {

class EventRecorder {
 public:
  using ShapeGetterFuncType = std::function<std::vector<Shape>(void)>;

  OF_DISALLOW_COPY_AND_MOVE(EventRecorder);

  explicit EventRecorder(const std::shared_ptr<IEvent>& event) : event_(event) {
    CHECK_JUST(RegisterEventToProfileManager(event));
    event_->Start();
  }

  Maybe<void> RegisterEventToProfileManager(const std::shared_ptr<IEvent>& event);

  ~EventRecorder() {
    if (event_) {
      event_->Finish();
      event_.reset();
    }
  }

  static std::shared_ptr<EventRecorder> CreateCustomEventRecorder(const std::string& name);

  static Maybe<EventRecorder> CreateKernelEventRecorder(
      const std::string& name,
#if defined(WITH_CUDA) || defined(WITH_ROCM)
      const std::function<int64_t()>& memory_size_getter,
#endif
      const ShapeGetterFuncType& shape_getter);

 private:
  std::shared_ptr<IEvent> event_;
};

}  // namespace profiler
}  // namespace oneflow

#endif  // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
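Not part of the commit: the RAII usage the EventRecorder above is designed for, sketched under two assumptions that are not confirmed by this diff: the header lives at oneflow/core/profiler/event_recorder.h (inferred from its include guard), and a profiling session is active so registering the event succeeds. SketchEventRecorderScope is a hypothetical name.

// Illustrative sketch only, not part of the commit; path and active profiler assumed.
#include "oneflow/core/profiler/event_recorder.h"

void SketchEventRecorderScope() {  // hypothetical helper name
  // The constructor registers the event with the ProfileManager and calls Start();
  // the destructor calls Finish() and releases the event.
  auto recorder = oneflow::profiler::EventRecorder::CreateCustomEventRecorder("my_scope");
  // ... the code being measured ...
}  // `recorder` is destroyed here, stamping the finish time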
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
#define ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_

#include "oneflow/core/common/thread_local_guard.h"

namespace oneflow {

enum class SyncVmMode {
  kInvalid = 0,
  kEnable = 1,
  kDisable = 2,
};

class SyncVmModeGuard final : public ThreadLocalGuard<SyncVmMode> {
 public:
  using ThreadLocalGuard<SyncVmMode>::ThreadLocalGuard;
  ~SyncVmModeGuard() = default;

  static bool IsCurrentSyncVmMode() {
    const auto& opt_sync_mode = Current();
    return opt_sync_mode.has_value() && CHECK_JUST(opt_sync_mode) == SyncVmMode::kEnable;
  }
};

}  // namespace oneflow

#endif  // ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
\ No newline at end of file
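Not part of the commit: a sketch of how the thread-local guard above is typically consulted. Two assumptions are made that this diff does not confirm: the header path (inferred from the include guard) and that ThreadLocalGuard<SyncVmMode> exposes a constructor taking the value to install for the current scope. SketchSyncVmModeGuard is a hypothetical name.

// Illustrative sketch only, not part of the commit; path and constructor assumed.
#include "oneflow/core/vm/sync_vm_mode_guard.h"

void SketchSyncVmModeGuard() {  // hypothetical helper name
  using oneflow::SyncVmMode;
  using oneflow::SyncVmModeGuard;
  // With no guard installed on this thread, Current() holds no value, so the query is false.
  bool outside = SyncVmModeGuard::IsCurrentSyncVmMode();
  {
    SyncVmModeGuard guard(SyncVmMode::kEnable);                // assumed value constructor
    bool inside = SyncVmModeGuard::IsCurrentSyncVmMode();      // true within this scope
    (void)inside;
  }
  (void)outside;
}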
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/common/data_type.h"
#include "oneflow/core/kernel/util/cuda_half_util.h"
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/operator/operator_util.h"
#include "oneflow/user/utils/pool_util.h"
#include <algorithm>
#include <cfloat>
#include <cmath>

namespace oneflow {
namespace user_op {

#define START_IND(a, b, c) (int)std::floor((float)(a * c) / b)
#define END_IND(a, b, c) (int)std::ceil((float)((a + 1) * c) / b)
#define START_IND_INT(a, b, c) ((a * c) / b)
#define END_IND_INT(a, b, c) (((a + 1) * c + b - 1) / b)

template<typename T>
__global__ void InitPtr(int elements, T* ptr) {
  int gid = (blockDim.x * blockIdx.x) + threadIdx.x;
  int step = gridDim.x * blockDim.x;
  while (gid < elements) {
    ptr[gid] = static_cast<T>(0);
    gid += step;
  }
}

inline Shape GetShape5D(const Shape& shape, const std::string& data_format, int32_t dim) {
  FixedDimVector shape_3d = {GetInDim(shape, data_format, 0, dim),
                             GetInDim(shape, data_format, 1, dim),
                             GetInDim(shape, data_format, 2, dim)};
  return Shape({shape.At(0), shape.At(1), shape_3d.at(0), shape_3d.at(1), shape_3d.at(2)});
}

template<typename T>
__global__ void AdaptiveAvgPoolCudaKernel(const T* input, T* output, int num_elems, int in_d,
                                          int in_h, int in_w, int out_d, int out_h, int out_w) {
  const int out_panel_size = out_d * out_h * out_w;
  const int in_panel_size = in_d * in_h * in_w;
  CUDA_1D_KERNEL_LOOP(idx, num_elems) {
    // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper'
    int bc_idx = idx / out_panel_size;
    int out_d_idx = (idx % out_panel_size) / out_w / out_h;
    int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w;
    int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w;
    int in_start_d = START_IND(out_d_idx, out_d, in_d);
    int in_end_d = END_IND(out_d_idx, out_d, in_d);
    int k_d = in_end_d - in_start_d;
    int in_start_h = START_IND(out_h_idx, out_h, in_h);
    int in_end_h = END_IND(out_h_idx, out_h, in_h);
    int k_h = in_end_h - in_start_h;
    int in_start_w = START_IND(out_w_idx, out_w, in_w);
    int in_end_w = END_IND(out_w_idx, out_w, in_w);
    int k_w = in_end_w - in_start_w;
    const T* in_ptr =
        input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w;
    T sum = static_cast<T>(0);
    for (int id = 0; id < k_d; ++id) {
      for (int ih = 0; ih < k_h; ++ih) {
        for (int iw = 0; iw < k_w; ++iw) {
          T val = *(in_ptr + ih * in_w + iw);
          sum += val;
        }
      }
      in_ptr += in_h * in_w;  // next input depth
    }
    // Update output
    output[idx] = sum / k_d / k_h / k_w;
  }
}

template<typename T>
__global__ void AdaptiveAvgPoolGradCudaKernel(T* input, const T* output, int num_elems, int in_d,
                                              int in_h, int in_w, int out_d, int out_h,
                                              int out_w) {
  const int out_panel_size = out_d * out_h * out_w;
  const int in_panel_size = in_d * in_h * in_w;
  CUDA_1D_KERNEL_LOOP(idx, num_elems) {
    // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper'
    int bc_idx = idx / out_panel_size;
    int out_d_idx = (idx % out_panel_size) / out_w / out_h;
    int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w;
    int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w;
    int in_start_d = START_IND(out_d_idx, out_d, in_d);
    int in_end_d = END_IND(out_d_idx, out_d, in_d);
    int k_d = in_end_d - in_start_d;
    int in_start_h = START_IND(out_h_idx, out_h, in_h);
    int in_end_h = END_IND(out_h_idx, out_h, in_h);
    int k_h = in_end_h - in_start_h;
    int in_start_w = START_IND(out_w_idx, out_w, in_w);
    int in_end_w = END_IND(out_w_idx, out_w, in_w);
    int k_w = in_end_w - in_start_w;
    const T grad_delta = output[idx] / k_d / k_h / k_w;
    T* input_ptr =
        input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w;
    for (int id = 0; id < k_d; ++id) {
      for (int ih = 0; ih < k_h; ++ih) {
        for (int iw = 0; iw < k_w; ++iw) {
          // TODO (Tianyu): Use 'atomic::Add' when necessary
          cuda::atomic::Add(input_ptr + ih * in_w + iw, grad_delta);
        }
      }
      input_ptr += in_h * in_w;  // next input depth
    }
  }
}

template<typename T>
void AvgForwardCompute(KernelComputeContext* ctx, const int32_t& dim) {
  const Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
  Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
  const T* in_ptr = in_tensor->dptr<T>();
  T* out_ptr = out_tensor->mut_dptr<T>();

  const Shape& x_shape = ctx->TensorDesc4ArgNameAndIndex("x", 0)->shape();
  const Shape& y_shape = ctx->TensorDesc4ArgNameAndIndex("y", 0)->shape();

  // TODO (Tianyu): Support 'channels_last'
  std::string data_format = "channels_first";
  const Shape& in = GetShape5D(x_shape, data_format, dim);
  const Shape& out = GetShape5D(y_shape, data_format, dim);

  const int out_elems = out_tensor->shape_view().elem_cnt();

  RUN_CUDA_KERNEL((AdaptiveAvgPoolCudaKernel<T>), ctx->stream(), out_elems, in_ptr, out_ptr,
                  out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4));
}

template<typename T>
void AvgBackwardCompute(KernelComputeContext* ctx, const int32_t& dim) {
  const Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
  Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
  const T* out_ptr = out_tensor->dptr<T>();
  T* in_ptr = in_tensor->mut_dptr<T>();

  const Shape& dx_shape = ctx->TensorDesc4ArgNameAndIndex("dx", 0)->shape();
  const Shape& dy_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape();

  // TODO (Tianyu): Support 'channels_last'
  std::string data_format = "channels_first";
  const Shape& in = GetShape5D(dx_shape, data_format, dim);
  const Shape& out = GetShape5D(dy_shape, data_format, dim);

  const int in_elems = in_tensor->shape_view().elem_cnt();
  const int out_elems = out_tensor->shape_view().elem_cnt();

  RUN_CUDA_KERNEL((InitPtr<T>), ctx->stream(), in_elems, in_elems, in_ptr);
  RUN_CUDA_KERNEL((AdaptiveAvgPoolGradCudaKernel<T>), ctx->stream(), out_elems, in_ptr, out_ptr,
                  out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4));
}

template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool1dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool1dKernel() = default;
  ~GpuAdaptiveAvgPool1dKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 1); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool2dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool2dKernel() = default;
  ~GpuAdaptiveAvgPool2dKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 2); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool3dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool3dKernel() = default;
  ~GpuAdaptiveAvgPool3dKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 3); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool1dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool1dGradKernel() = default;
  ~GpuAdaptiveAvgPool1dGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 1); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool2dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool2dGradKernel() = default;
  ~GpuAdaptiveAvgPool2dGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 2); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool3dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool3dGradKernel() = default;
  ~GpuAdaptiveAvgPool3dGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 3); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(device, dtype)                   \
  REGISTER_USER_KERNEL("adaptive_avg_pool1d")                                  \
      .SetCreateFn<GpuAdaptiveAvgPool1dKernel<device, dtype>>()                \
      .SetIsMatchedHob((HobDeviceType() == device)                             \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value)); \
  REGISTER_USER_KERNEL("adaptive_avg_pool2d")                                  \
      .SetCreateFn<GpuAdaptiveAvgPool2dKernel<device, dtype>>()                \
      .SetIsMatchedHob((HobDeviceType() == device)                             \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value)); \
  REGISTER_USER_KERNEL("adaptive_avg_pool3d")                                  \
      .SetCreateFn<GpuAdaptiveAvgPool3dKernel<device, dtype>>()                \
      .SetIsMatchedHob((HobDeviceType() == device)                             \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value));

REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, float);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, double);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, int);

#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(device, dtype)           \
  REGISTER_USER_KERNEL("adaptive_avg_pool1d_grad")                              \
      .SetCreateFn<GpuAdaptiveAvgPool1dGradKernel<device, dtype>>()             \
      .SetIsMatchedHob((HobDeviceType() == device)                              \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value)); \
  REGISTER_USER_KERNEL("adaptive_avg_pool2d_grad")                              \
      .SetCreateFn<GpuAdaptiveAvgPool2dGradKernel<device, dtype>>()             \
      .SetIsMatchedHob((HobDeviceType() == device)                              \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value)); \
  REGISTER_USER_KERNEL("adaptive_avg_pool3d_grad")                              \
      .SetCreateFn<GpuAdaptiveAvgPool3dGradKernel<device, dtype>>()             \
      .SetIsMatchedHob((HobDeviceType() == device)                              \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value));

REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, float);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, double);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, int);

}  // namespace user_op
}  // namespace oneflow
\ No newline at end of file
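Not part of the commit: a standalone host-side reproduction of the window partition computed by the START_IND/END_IND macros above. For in_size = 10 and out_size = 4 it prints the windows [0, 3), [2, 5), [5, 8), [7, 10); neighbouring windows can overlap, which is why the backward kernel accumulates gradients with cuda::atomic::Add.

// Illustrative sketch only, not part of the commit.
#include <cmath>
#include <cstdio>

int main() {
  const int out_size = 4;
  const int in_size = 10;
  for (int a = 0; a < out_size; ++a) {
    // Same arithmetic as the START_IND / END_IND macros above.
    const int start = static_cast<int>(std::floor(static_cast<float>(a * in_size) / out_size));
    const int end = static_cast<int>(std::ceil(static_cast<float>((a + 1) * in_size) / out_size));
    std::printf("output %d averages input [%d, %d)\n", a, start, end);
  }
  return 0;
}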
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/device/cuda_util.h"
#include "affine_grid_kernel.h"

namespace oneflow {

namespace {

template<typename data_type, bool align_corners>
OF_DEVICE_FUNC data_type LinspaceGPU(int32_t index, int32_t num_steps) {
  if (num_steps <= 1) { return static_cast<data_type>(0.0); }
  if (align_corners) {
    return static_cast<data_type>(-1.0 + 2.0 / (num_steps - 1) * index);
  } else {
    return static_cast<data_type>((-1.0 + 2.0 / (num_steps - 1) * index) * (num_steps - 1)
                                  / num_steps);
  }
}

template<typename data_type, bool align_corners>
__global__ void Generate2DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t H,
                                            int32_t W) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int32_t h = index / W;
    const int32_t w = index % W;
    const int32_t pixel_length = 3;
    data_type* row_ptr = grid_ptr + h * W * pixel_length;
    data_type* pixel_ptr = row_ptr + w * pixel_length;
    data_type h_value = LinspaceGPU<data_type, align_corners>(h, H);
    data_type w_value = LinspaceGPU<data_type, align_corners>(w, W);
    pixel_ptr[0] = w_value;
    pixel_ptr[1] = h_value;
    pixel_ptr[2] = static_cast<data_type>(1.0);
  }
}

template<typename data_type, bool align_corners>
__global__ void Generate3DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t D,
                                            int32_t H, int32_t W) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int32_t d = index / H;
    const int32_t h = index % H;
    const int32_t pixel_length = 4;
    data_type* image_ptr = grid_ptr + d * H * W * pixel_length;
    data_type* row_ptr = image_ptr + h * W * pixel_length;
    data_type d_value = LinspaceGPU<data_type, align_corners>(d, D);
    data_type h_value = LinspaceGPU<data_type, align_corners>(h, H);
    for (int32_t w = 0; w < W; ++w) {
      data_type* pixel_ptr = row_ptr + w * pixel_length;
      data_type w_value = LinspaceGPU<data_type, align_corners>(w, W);
      pixel_ptr[0] = w_value;
      pixel_ptr[1] = h_value;
      pixel_ptr[2] = d_value;
      pixel_ptr[3] = static_cast<data_type>(1.0);
    }
  }
}

}  // namespace

void GenerateBaseGridImp<DeviceType::kCUDA>::Generate2D(user_op::KernelComputeContext* ctx,
                                                        float* grid_ptr, int64_t H, int64_t W,
                                                        bool align_corners) {
  int count = H * W;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<float, true>), ctx->stream(), count, count,
                    grid_ptr, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<float, false>), ctx->stream(), count, count,
                    grid_ptr, H, W);
  }
}

void GenerateBaseGridImp<DeviceType::kCUDA>::Generate2D(user_op::KernelComputeContext* ctx,
                                                        double* grid_ptr, int64_t H, int64_t W,
                                                        bool align_corners) {
  int count = H * W;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<double, true>), ctx->stream(), count, count,
                    grid_ptr, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<double, false>), ctx->stream(), count, count,
                    grid_ptr, H, W);
  }
}

void GenerateBaseGridImp<DeviceType::kCUDA>::Generate3D(user_op::KernelComputeContext* ctx,
                                                        float* grid_ptr, int64_t D, int64_t H,
                                                        int64_t W, bool align_corners) {
  int count = D * H;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<float, true>), ctx->stream(), count, count,
                    grid_ptr, D, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<float, false>), ctx->stream(), count, count,
                    grid_ptr, D, H, W);
  }
}

void GenerateBaseGridImp<DeviceType::kCUDA>::Generate3D(user_op::KernelComputeContext* ctx,
                                                        double* grid_ptr, int64_t D, int64_t H,
                                                        int64_t W, bool align_corners) {
  int count = D * H;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<double, true>), ctx->stream(), count, count,
                    grid_ptr, D, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<double, false>), ctx->stream(), count, count,
                    grid_ptr, D, H, W);
  }
}

}  // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_ROCM
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/user/kernels/arange_kernel_util.h"

namespace oneflow {
namespace user_op {
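// ArangeForwardGpuKernel delegates to DoArange (declared in arange_kernel_util.h), which is
// expected to fill out[i] = start + i * delta for the arange_elem_cnt elements.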
template<typename T>
__global__ void ArangeForwardGpuKernel(const T start, const T delta, const int64_t arange_elem_cnt,
                                       T* out) {
  // Use Loop to set the value
  DoArange<T>(start, delta, arange_elem_cnt, out);
}

template<typename T>
struct ArangeFunctor<DeviceType::kCUDA, T> final {
  void operator()(ep::Stream* stream, const T start, const T delta, const int64_t arange_elem_cnt,
                  T* out) {
    // The thread num is set as arange_elem_cnt
    RUN_CUDA_KERNEL((ArangeForwardGpuKernel<T>), stream, arange_elem_cnt, start, delta,
                    arange_elem_cnt, out);
  }
};

OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ARANGE_FUNCTOR, (DeviceType::kCUDA),
                                 ARANGE_DATA_TYPE_SEQ);

} // namespace user_op
} // namespace oneflow
#endif // End WITH_ROCM
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/user/kernels/radix_sort.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"

namespace oneflow {
namespace {
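// TmpBufferManager carves one workspace allocation into three consecutive regions:
// | sorted_in (aligned) | indices (aligned) | radix-sort temp storage (remainder) |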
template<typename T>
class TmpBufferManager final {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager);
  TmpBufferManager(int32_t capacity, void* ptr, const ShapeView& in_shape)
      : capacity_{capacity},
        sorted_in_elem_cnt_{in_shape.elem_cnt()},
        indices_elem_cnt_{sorted_in_elem_cnt_} {
    const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T));
    const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t));
    sorted_in_ptr_ = reinterpret_cast<T*>(ptr);
    indices_ptr_ = reinterpret_cast<int32_t*>(reinterpret_cast<char*>(sorted_in_ptr_)
                                              + sorted_in_aligned_bytes);
    temp_storage_ptr_ =
        reinterpret_cast<void*>(reinterpret_cast<char*>(indices_ptr_) + indices_aligned_bytes);
    temp_storage_bytes_ = capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes;
    CHECK_GE(temp_storage_bytes_, 0);
  }
  ~TmpBufferManager() = default;

  T* SortedInPtr() const { return sorted_in_ptr_; }
  int32_t* IndicesPtr() const { return indices_ptr_; }
  void* TempStoragePtr() const { return temp_storage_ptr_; }
  int32_t TempStorageBytes() const { return temp_storage_bytes_; }

 private:
  int32_t capacity_;
  T* sorted_in_ptr_;
  int32_t* indices_ptr_;
  void* temp_storage_ptr_;
  int64_t sorted_in_elem_cnt_;
  int64_t indices_elem_cnt_;
  int32_t temp_storage_bytes_;
};

__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr, int32_t instance_size) {
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }
}

} // namespace

template<typename T>
class GpuArgSortKernel final : public user_op::OpKernel {
 public:
  GpuArgSortKernel() = default;
  ~GpuArgSortKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    TmpBufferManager<T> buf_manager(static_cast<int32_t>(tmp_buffer->shape_view().elem_cnt()),
                                    tmp_buffer->mut_dptr<void>(), in->shape_view());
    const int32_t elem_cnt = in->shape_view().elem_cnt();
    const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1);
    const int32_t instance_num = elem_cnt / instance_size;
    const std::string& direction = ctx->Attr<std::string>("direction");
    InitializeIndices<<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                        ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, buf_manager.IndicesPtr(), instance_size);
    if (direction == "ASCENDING") {
      SortPairsAscending(in->dptr<T>(), buf_manager.IndicesPtr(), instance_num, instance_size,
                         buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(),
                         buf_manager.SortedInPtr(), out->mut_dptr<int32_t>(),
                         ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    } else if (direction == "DESCENDING") {
      SortPairsDescending(in->dptr<T>(), buf_manager.IndicesPtr(), instance_num, instance_size,
                          buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(),
                          buf_manager.SortedInPtr(), out->mut_dptr<int32_t>(),
                          ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    } else {
      UNIMPLEMENTED();
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

#define REGISTER_CUDA_ARG_SORT_KERNEL(dtype) \
  REGISTER_USER_KERNEL("arg_sort") \
      .SetCreateFn<GpuArgSortKernel<dtype>>() \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
                       && (user_op::HobDataType("in", 0) == GetDataType<dtype>::value)) \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \
        const Shape& in_shape = ctx->InputShape("in", 0); \
        const int32_t elem_cnt = in_shape.elem_cnt(); \
        const int32_t instance_size = in_shape.dim_vec().back(); \
        const int32_t instance_num = elem_cnt / instance_size; \
        \
        /* Sorted In */ \
        const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype)); \
        /* Indices */ \
        const int32_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int32_t)); \
        /* CUB Temp Storage */ \
        int32_t temp_storage_bytes = -1; \
        const std::string& direction = ctx->Attr<std::string>("direction"); \
        if (direction == "ASCENDING") { \
          temp_storage_bytes = \
              InferTempStorageForSortPairsAscending<dtype, int32_t>(instance_num, instance_size); \
        } else if (direction == "DESCENDING") { \
          temp_storage_bytes = \
              InferTempStorageForSortPairsDescending<dtype, int32_t>(instance_num, instance_size); \
        } else { \
          UNIMPLEMENTED(); \
        } \
        \
        return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes; \
      });

REGISTER_CUDA_ARG_SORT_KERNEL(float)
REGISTER_CUDA_ARG_SORT_KERNEL(double)
REGISTER_CUDA_ARG_SORT_KERNEL(bool)
REGISTER_CUDA_ARG_SORT_KERNEL(int8_t)
REGISTER_CUDA_ARG_SORT_KERNEL(uint8_t)
REGISTER_CUDA_ARG_SORT_KERNEL(int32_t)
REGISTER_CUDA_ARG_SORT_KERNEL(int64_t)

} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/user/kernels/arg_where_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/core/common/small_vector.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hipcub/hipcub.hpp>

namespace oneflow {
namespace {

constexpr int kBlockSize = cuda::elementwise::kBlockSize;

int GetNumBlocks(int64_t elem_cnt) {
  int num_blocks = 0;
  OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks));
  return num_blocks;
}
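// StrideIterator exposes operator[] with a stride of NDIM elements, so DeviceSelect::Flagged can
// write each selected linear offset into the first slot of that element's ND index; the offsets
// are then expanded to full ND indices in place by CudaOffsetToNdIndexInplace.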
template<typename T, int NDIM>
struct StrideIterator {
  typedef StrideIterator self_type;
  typedef std::ptrdiff_t difference_type;
  typedef T value_type;
  typedef T* pointer;
  typedef T& reference;
  typedef std::random_access_iterator_tag iterator_category;

  explicit StrideIterator(T* ptr, size_t max_iters) : ptr_(ptr), max_iters_(max_iters) {}

  OF_DEVICE_FUNC reference operator[](int i) {
    assert(0 <= i && i < max_iters_);
    return *(ptr_ + (i * NDIM));
  }

 private:
  T* ptr_;
  size_t max_iters_;
};

template<typename T, int NDIM>
__global__ void __launch_bounds__(kBlockSize)
    CudaOffsetToNdIndexInplace(NdIndexOffsetHelper<T, NDIM> index_converter,
                               const T* output_size_ptr, T* output_ptr) {
  CUDA_1D_KERNEL_LOOP_T(T, i, *output_size_ptr) {
    T* index_ptr = output_ptr + i * NDIM;
    index_converter.OffsetToNdIndex(*index_ptr, index_ptr);
  }
}

template<typename T>
struct IsTrue {
  __device__ __forceinline__ bool operator()(const T& val) const { return static_cast<bool>(val); }
};

template<typename IN_T, typename OUT_T, typename OUT_ITER>
hipError_t SelectTrue(hipStream_t stream, int num_items, void* temp_storage,
                      size_t& temp_storage_bytes, const IN_T* input, OUT_ITER output_iter,
                      OUT_T* num_selected) {
  IsTrue<IN_T> is_true;
  hipcub::TransformInputIterator<bool, IsTrue<IN_T>, const IN_T*> flag_iter(input, is_true);
  hipcub::CountingInputIterator<OUT_T> offset_counter(0);
  return hipcub::DeviceSelect::Flagged(temp_storage, temp_storage_bytes, offset_counter, flag_iter,
                                       output_iter, num_selected, num_items, stream, false);
}

} // namespace

template<typename IN_T, typename OUT_T, int NDIM>
struct ArgWhereKernelUtil<DeviceType::kCUDA, IN_T, OUT_T, NDIM> {
  static void ArgWhere(ep::Stream* stream, const ShapeView& input_shape, const IN_T* input_ptr,
                       void* temp_storage, size_t temp_storage_bytes, OUT_T* output_ptr,
                       OUT_T* output_size_ptr) {
    const int64_t elem_cnt = input_shape.elem_cnt();
    // deal with empty blob
    if (elem_cnt == 0) {
      Memset<DeviceType::kCUDA>(stream, output_size_ptr, 0, sizeof(OUT_T));
      return;
    }
    CHECK_NOTNULL(stream);
    CHECK_LE(elem_cnt, std::numeric_limits<OUT_T>::max());
    size_t workspace = GetWorkspaceBytesSize(stream, elem_cnt);
    CHECK_LE(workspace, temp_storage_bytes);
    if (NDIM == 1) {
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OUT_T*>(
          stream->As<ep::CudaStream>()->cuda_stream(), input_shape.elem_cnt(), temp_storage,
          workspace, input_ptr, output_ptr, output_size_ptr)));
    } else {
      using OutputIterator = StrideIterator<OUT_T, NDIM>;
      OutputIterator output_iter(output_ptr, elem_cnt);
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OutputIterator>(
          stream->As<ep::CudaStream>()->cuda_stream(), elem_cnt, temp_storage, workspace, input_ptr,
          output_iter, output_size_ptr)));
      OUT_T dims[NDIM] = {0};
      std::transform(input_shape.ptr(), input_shape.ptr() + input_shape.NumAxes(), dims,
                     [](int64_t dim) { return static_cast<OUT_T>(dim); });
      NdIndexOffsetHelper<OUT_T, NDIM> index_converter(dims);
      CudaOffsetToNdIndexInplace<OUT_T, NDIM>
          <<<GetNumBlocks(elem_cnt), kBlockSize, 0, stream->As<ep::CudaStream>()->cuda_stream()>>>(
              index_converter, output_size_ptr, output_ptr);
    }
  }

  static size_t GetWorkspaceBytesSize(ep::Stream* stream, int64_t elem_cnt) {
    hipStream_t cuda_stream = stream ? stream->As<ep::CudaStream>()->cuda_stream() : 0;
    size_t workspace = 0;
    if (NDIM == 1) {
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OUT_T*>(cuda_stream, elem_cnt, nullptr, workspace,
                                                     nullptr, nullptr, nullptr)));
    } else {
      using OutputIterator = StrideIterator<OUT_T, NDIM>;
      OutputIterator output_iter(nullptr, elem_cnt);
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OutputIterator>(
          cuda_stream, elem_cnt, nullptr, workspace, nullptr, output_iter, nullptr)));
    }
    return workspace;
  }
};

INSTANTIATE_ARG_WHERE_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA)

} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include <hipcub/hipcub.hpp>
#include "oneflow/core/ep/rocm/cuda_stream.h"

namespace oneflow {
namespace {
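// TmpBufferManager splits the workspace into two consecutive regions:
// | per-row KeyValuePair results (aligned) | hipcub segmented-reduce temp storage (remainder) |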
template<typename T>
class TmpBufferManager final {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager);
  TmpBufferManager(int32_t capacity, void* ptr, int32_t instance_num)
      : capacity_{capacity}, key_value_out_elem_cnt_{instance_num} {
    const int32_t key_value_out_aligned_bytes =
        GetCudaAlignedSize(key_value_out_elem_cnt_ * sizeof(hipcub::KeyValuePair<int32_t, T>));
    key_value_out_ptr_ = reinterpret_cast<hipcub::KeyValuePair<int32_t, T>*>(ptr);
    temp_storage_ptr_ = reinterpret_cast<void*>(reinterpret_cast<char*>(key_value_out_ptr_)
                                                + key_value_out_aligned_bytes);
    temp_storage_bytes_ = capacity_ - key_value_out_aligned_bytes;
    CHECK_GE(temp_storage_bytes_, 0);
  }
  ~TmpBufferManager() = default;

  hipcub::KeyValuePair<int32_t, T>* KeyValueOutPtr() const { return key_value_out_ptr_; }
  void* TempStoragePtr() const { return temp_storage_ptr_; }
  int32_t TempStorageBytes() const { return temp_storage_bytes_; }

 private:
  int32_t capacity_;
  hipcub::KeyValuePair<int32_t, T>* key_value_out_ptr_;
  void* temp_storage_ptr_;
  int32_t key_value_out_elem_cnt_;
  int32_t temp_storage_bytes_;
};

class MultiplyFunctor final {
 public:
  MultiplyFunctor(int32_t num_col) : num_col_(num_col) {}
  __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const {
    return idx * num_col_;
  }

 private:
  int32_t num_col_;
};

template<typename T>
size_t InferTempStorageForArgMax(int32_t num_row, int32_t num_col) {
  using SegmentOffsetIter =
      hipcub::TransformInputIterator<int32_t, MultiplyFunctor, hipcub::CountingInputIterator<int32_t>>;
  hipcub::CountingInputIterator<int32_t> counting_iter(0);
  MultiplyFunctor multiply_functor(num_col);
  SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor);
  size_t temp_storage_bytes = 0;
  auto err =
      hipcub::DeviceSegmentedReduce::ArgMax<T*, hipcub::KeyValuePair<int32_t, T>*, SegmentOffsetIter>(
          /* d_temp_storage */ nullptr, /* temp_storage_bytes */ temp_storage_bytes,
          /* d_in */ nullptr, /* d_out */ nullptr, /* num_segments */ num_row,
          /* d_begin_offsets */ segment_offset_iter, /* d_end_offsets */ segment_offset_iter + 1,
          /* stream */ 0);
  // auto err =
  //     hipcub::DeviceReduce::ArgMax<T*, hipcub::KeyValuePair<int32_t, T>*>(
  //         nullptr, temp_storage_bytes,
  //         nullptr, nullptr, num_row,
  //         0);
  OF_CUDA_CHECK(err);
  return temp_storage_bytes;
}

template<typename T>
void ArgMax(const T* in_ptr, int32_t num_row, int32_t num_col, void* temp_storage_ptr,
            int32_t temp_storage_bytes, hipcub::KeyValuePair<int32_t, T>* out_ptr,
            hipStream_t stream) {
  size_t rt_inferred_temp_storage_bytes = InferTempStorageForArgMax<T>(num_row, num_col);
  CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes);
  using SegmentOffsetIter =
      hipcub::TransformInputIterator<int32_t, MultiplyFunctor, hipcub::CountingInputIterator<int32_t>>;
  hipcub::CountingInputIterator<int32_t> counting_iter(0);
  MultiplyFunctor multiply_functor(num_col);
  SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor);
  // void * d_temp_storage = nullptr;
  // hipMalloc((void **)&d_temp_storage, rt_inferred_temp_storage_bytes);
  auto err = hipcub::DeviceSegmentedReduce::ArgMax(
      /* d_temp_storage */ temp_storage_ptr,
      /* temp_storage_bytes */ rt_inferred_temp_storage_bytes,
      /* d_in */ in_ptr,
      /* d_out */ out_ptr,
      /* num_segments */ num_row,
      /* d_begin_offsets */ segment_offset_iter,
      /* d_end_offsets */ segment_offset_iter + 1,
      /* stream */ stream);
  // auto err =
  //     hipcub::DeviceReduce::ArgMax(
  //         d_temp_storage, rt_inferred_temp_storage_bytes,
  //         in_ptr, out_ptr, num_row,
  //         stream);
  OF_CUDA_CHECK(err);
}

template<typename T>
__global__ void WriteKeysToOutput(const int32_t instance_num, const int32_t instance_size,
                                  const hipcub::KeyValuePair<int32_t, T>* key_value_out_ptr,
                                  int64_t* out_ptr) {
  CUDA_1D_KERNEL_LOOP(i, instance_num) { out_ptr[i] = key_value_out_ptr[i].key % instance_size; }
}

} // namespace

template<typename T>
class GpuArgMaxKernel final : public user_op::OpKernel {
 public:
  GpuArgMaxKernel() = default;
  ~GpuArgMaxKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int32_t elem_cnt = in->shape_view().elem_cnt();
    const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1);
    const int32_t instance_num = elem_cnt / instance_size;
    TmpBufferManager<T> buffer_manager(tmp_buffer->shape_view().elem_cnt(),
                                       tmp_buffer->mut_dptr<void>(), instance_num);
    ArgMax(in->dptr<T>(), instance_num, instance_size, buffer_manager.TempStoragePtr(),
           buffer_manager.TempStorageBytes(), buffer_manager.KeyValueOutPtr(),
           ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    WriteKeysToOutput<T><<<BlocksNum4ThreadsNum(instance_num), kCudaThreadsNumPerBlock, 0,
                           ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        instance_num, instance_size, buffer_manager.KeyValueOutPtr(), out->mut_dptr<int64_t>());
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

#define REGISTER_CUDA_ARGMAX_KERNEL(dtype) \
  REGISTER_USER_KERNEL("argmax") \
      .SetCreateFn<GpuArgMaxKernel<dtype>>() \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
                       && (user_op::HobDataType("in", 0) == GetDataType<dtype>::value)) \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \
        const Shape& in_shape = ctx->InputShape("in", 0); \
        const int32_t instance_size = in_shape.dim_vec().back(); \
        const int32_t instance_num = in_shape.elem_cnt() / instance_size; \
        \
        /* Key-Value Out */ \
        int32_t key_value_out_bytes = \
            GetCudaAlignedSize(instance_num * sizeof(hipcub::KeyValuePair<int32_t, dtype>)); \
        \
        /* CUB Temp Storage */ \
        size_t temp_storage_bytes = InferTempStorageForArgMax<dtype>(instance_num, instance_size); \
        \
        return key_value_out_bytes + temp_storage_bytes; \
      });

REGISTER_CUDA_ARGMAX_KERNEL(float)
REGISTER_CUDA_ARGMAX_KERNEL(double)
REGISTER_CUDA_ARGMAX_KERNEL(uint8_t)
REGISTER_CUDA_ARGMAX_KERNEL(int8_t)
REGISTER_CUDA_ARGMAX_KERNEL(int32_t)
REGISTER_CUDA_ARGMAX_KERNEL(int64_t)

} // namespace oneflow
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
/* /*
Copyright 2020 The OneFlow Authors. All rights reserved. Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
#include <cstdint> #include <cstdint>
#include "oneflow/core/hip/atomic.hip.h" #include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/common/just.h" #include "oneflow/core/common/just.h"
#include "oneflow/core/common/util.h" #include "oneflow/core/common/util.h"
#include "oneflow/core/framework/consistency_check.h" #include "oneflow/core/framework/consistency_check.h"
#include "oneflow/core/framework/framework.h" #include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h" #include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.h" #include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h" #include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/common/nd_index_offset_helper.h" #include "oneflow/core/common/nd_index_offset_helper.h"
namespace oneflow { namespace oneflow {
namespace { namespace {
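// as_strided maps every output element back to an input offset:
//   index_in_input = storage_offset + sum_j(dst_index[j] * stride[j])
// The forward kernel gathers from that offset; the backward kernel scatters gradients to it with
// an atomic add, since several output elements may alias the same input element.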
constexpr size_t NUM_DIM = 8;

template<size_t num_dims, typename IndexType>
struct AsStridedParams {
  NdIndexOffsetHelper<IndexType, num_dims> destIndexOffsetHelper;
  int64_t dest_dims[num_dims];
  int32_t stride[num_dims];
  int32_t dest_num_dims;
  int32_t storage_offset;
  int32_t input_num;
  int32_t output_num;
};

template<typename T>
__global__ void AsStrided_kernel(const T* input_buf, T* output_buf,
                                 AsStridedParams<NUM_DIM, int64_t> params) {
  const int64_t* dest_dims = reinterpret_cast<const int64_t*>(params.dest_dims);
  const int32_t* stride = reinterpret_cast<const int32_t*>(params.stride);
  CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) {
    int64_t dst_index[NUM_DIM];
    params.destIndexOffsetHelper.OffsetToNdIndex(i, dst_index, params.dest_num_dims);
    int32_t index_in_input = params.storage_offset;
    FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_input += dst_index[j] * stride[j]; }
    output_buf[i] = input_buf[index_in_input];
  }
}

template<typename T>
__global__ void AsStridedGrad_kernel(const T* dy_buf, T* dx_buf,
                                     AsStridedParams<NUM_DIM, int64_t> params) {
  const int64_t* dest_dims = reinterpret_cast<const int64_t*>(params.dest_dims);
  const int32_t* stride = reinterpret_cast<const int32_t*>(params.stride);
  CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) {
    int64_t dy_index[NUM_DIM];
    params.destIndexOffsetHelper.OffsetToNdIndex(i, dy_index, params.dest_num_dims);
    int32_t index_in_dx = params.storage_offset;
    FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_dx += dy_index[j] * stride[j]; }
    cuda::atomic::Add(dx_buf + index_in_dx, dy_buf[i]);
  }
}

template<typename T>
struct AsStridedFunctor final {
  void operator()(ep::Stream* stream, const T* input_buf, T* output_buf, const int64_t* dest_dims,
                  const int32_t* stride, const int32_t dest_num_dims, const int32_t storage_offset,
                  const int32_t input_num, const int32_t output_num) {
    NdIndexOffsetHelper<int64_t, NUM_DIM> destIndexOffsetHelper(dest_dims, dest_num_dims);
    AsStridedParams<NUM_DIM, int64_t> params;
    params.destIndexOffsetHelper = destIndexOffsetHelper;
    FOR_RANGE(size_t, i, 0, dest_num_dims) {
      params.dest_dims[i] = dest_dims[i];
      params.stride[i] = stride[i];
    }
    params.dest_num_dims = dest_num_dims;
    params.storage_offset = storage_offset;
    params.input_num = input_num;
    params.output_num = output_num;
    AsStrided_kernel<T>
        <<<BlocksNum4ThreadsNum(output_num), kCudaThreadsNumPerBlock, 0,
           stream->As<ep::CudaStream>()->cuda_stream()>>>(input_buf, output_buf, params);
  }
};

template<typename T>
struct AsStridedGradFunctor final {
  void operator()(ep::Stream* stream, const T* dy_buf, T* dx_buf, const int64_t* dy_dims,
                  const int32_t* stride, const int32_t dy_num_dims, const int32_t storage_offset,
                  const int32_t dx_num, const int32_t dy_num) {
    NdIndexOffsetHelper<int64_t, NUM_DIM> dyIndexOffsetHelper(dy_dims, dy_num_dims);
    AsStridedParams<NUM_DIM, int64_t> params;
    params.destIndexOffsetHelper = dyIndexOffsetHelper;
    FOR_RANGE(size_t, i, 0, dy_num_dims) {
      params.dest_dims[i] = dy_dims[i];
      params.stride[i] = stride[i];
    }
    params.dest_num_dims = dy_num_dims;
    params.storage_offset = storage_offset;
    params.input_num = dx_num;
    params.output_num = dy_num;
    AsStridedGrad_kernel<T>
        <<<BlocksNum4ThreadsNum(dy_num), kCudaThreadsNumPerBlock, 0,
           stream->As<ep::CudaStream>()->cuda_stream()>>>(dy_buf, dx_buf, params);
  }
};

} // namespace

template<typename T>
class GpuAsStridedKernel final : public user_op::OpKernel {
 public:
  GpuAsStridedKernel() = default;
  ~GpuAsStridedKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0);
    user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0);
    const auto size = ctx->Attr<std::vector<int32_t>>("size");
    const auto stride = ctx->Attr<std::vector<int32_t>>("stride");
    const int32_t storage_offset = ctx->Attr<int32_t>("storage_offset");
    size_t dest_num_dims = output->shape_view().NumAxes();
    const int64_t* dest_dims = output->shape_view().ptr();
    const size_t input_num = input->shape_view().Count(0);
    const size_t output_num = output->shape_view().Count(0);
    if (input_num == 0) {
      // 0-size tensor
      return;
    }
    AsStridedFunctor<T>()(ctx->stream(), input->dptr<T>(), output->mut_dptr<T>(), dest_dims,
                          stride.data(), dest_num_dims, storage_offset, input_num, output_num);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<typename T>
class GpuAsStridedGradKernel final : public user_op::OpKernel {
 public:
  GpuAsStridedGradKernel() = default;
  ~GpuAsStridedGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
    user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
    const auto size = ctx->Attr<std::vector<int32_t>>("size");
    const auto stride = ctx->Attr<std::vector<int32_t>>("stride");
    const int32_t storage_offset = ctx->Attr<int32_t>("storage_offset");
    size_t dy_num_dims = dy->shape_view().NumAxes();
    const int64_t* dy_dims = dy->shape_view().ptr();
    const size_t dx_num = dx->shape_view().Count(0);
    const size_t dy_num = dy->shape_view().Count(0);
    Memset<DeviceType::kCUDA>(ctx->stream(), dx->mut_dptr(), 0,
                              dx->shape_view().Count(0) * sizeof(T));
    AsStridedGradFunctor<T>()(ctx->stream(), dy->dptr<T>(), dx->mut_dptr<T>(), dy_dims,
                              stride.data(), dy_num_dims, storage_offset, dx_num, dy_num);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

#define REGISTER_GPUASSTRIDED_KERNEL(in_type) \
  REGISTER_USER_KERNEL("as_strided") \
      .SetCreateFn<GpuAsStridedKernel<in_type>>() \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
                       && (user_op::HobDataType("input", 0) == GetDataType<in_type>::value)); \
  REGISTER_USER_KERNEL("as_strided_grad") \
      .SetCreateFn<GpuAsStridedGradKernel<in_type>>() \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
                       && (user_op::HobDataType("input", 0) == GetDataType<in_type>::value));

REGISTER_GPUASSTRIDED_KERNEL(half);
REGISTER_GPUASSTRIDED_KERNEL(float);
REGISTER_GPUASSTRIDED_KERNEL(double);
REGISTER_GPUASSTRIDED_KERNEL(int64_t);

#undef REGISTER_GPUASSTRIDED_KERNEL

} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"

namespace oneflow {
namespace {
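// AssignGpu copies value into ref only when the single-element condition tensor matches assign_if:
// with assign_if == true the copy runs when *condition != 0, with assign_if == false it runs when
// *condition == 0; otherwise every thread returns immediately.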
template<bool assign_if, typename C, typename T> template<bool assign_if, typename C, typename T>
__global__ void AssignGpu(int64_t elem_cnt, const C* condition, const T* value, T* ref) { __global__ void AssignGpu(int64_t elem_cnt, const C* condition, const T* value, T* ref) {
if (assign_if == (*condition == 0)) { return; } if (assign_if == (*condition == 0)) { return; }
CUDA_1D_KERNEL_LOOP(i, elem_cnt) { ref[i] = value[i]; } CUDA_1D_KERNEL_LOOP(i, elem_cnt) { ref[i] = value[i]; }
} }
template<bool assign_if, typename C, typename T>
class AssignIfGPUKernel final : public user_op::OpKernel {
 public:
  AssignIfGPUKernel() = default;
  ~AssignIfGPUKernel() override = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* condition = ctx->Tensor4ArgNameAndIndex("condition", 0);
    CHECK_EQ(condition->shape_view().NumAxes(), 1);
    CHECK_EQ(condition->shape_view().At(0), 1);
    const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0);
    user_op::Tensor* ref = ctx->Tensor4ArgNameAndIndex("ref", 0);
    if (value->dptr() == ref->dptr()) { return; }
    CHECK_EQ(value->shape_view(), ref->shape_view());
    CHECK_EQ(value->data_type(), ref->data_type());
    const size_t elem_cnt = ref->shape_view().elem_cnt();
    AssignGpu<assign_if, C, T><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                 ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, condition->dptr<C>(), value->dptr<T>(), ref->mut_dptr<T>());
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; }
};
} // namespace
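// The macros below register two user-op kernels for every (condition_type, value_type) pair
// in INT_DATA_TYPE_SEQ x POD_DATA_TYPE_SEQ: "assign_if" copies when the condition is
// non-zero, and "assign_if_not" copies when the condition is zero.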
#define REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(op_type_name, assign_if, condition_type, \
                                                         value_type)                              \
  REGISTER_USER_KERNEL(op_type_name)                                                              \
      .SetCreateFn<AssignIfGPUKernel<assign_if, condition_type, value_type>>()                    \
      .SetIsMatchedHob(                                                                           \
          (user_op::HobDeviceType() == DeviceType::kCUDA)                                         \
          && (user_op::HobDataType("condition", 0) == GetDataType<condition_type>::value)         \
          && (user_op::HobDataType("value", 0) == GetDataType<value_type>::value));
#define REGISTER_ASSIGN_IF_CUDA_KERNEL(condition_type, value_type)                        \
  REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(                                       \
      "assign_if", true, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type)); \
  REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(                                       \
      "assign_if_not", false, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type))
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_ASSIGN_IF_CUDA_KERNEL, INT_DATA_TYPE_SEQ,
                                 POD_DATA_TYPE_SEQ)
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include <cstdint>
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/user/kernels/avg_pool_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
constexpr int kBlockSize = cuda::elementwise::kBlockSize;
int GetMinThreadNum(const int64_t elem_num) { return std::min<int64_t>(elem_num, kBlockSize); }
int GetNumBlocks(int32_t elem_cnt) {
  int num_blocks = 0;
  OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks));
  return num_blocks;
}
} // namespace
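// Each DoCUDAAvgPool{1,2,3}d{Forward,Backward} kernel below is a thin __global__ wrapper
// around the shared Avgpool*Compute device routines declared in avg_pool_kernel_util.h.
// Launch configuration: GetNumBlocks() picks the block count via the elementwise helper,
// and GetMinThreadNum() caps the threads per block at kBlockSize for small workloads.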
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool1dForward(const NdIndexOffsetHelper<IDX, 2> index_helper, IDX elem_num,
                                const T* src, T* dest, int32_t padding_l, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_length,
                                const int32_t kernel_size_l, const int32_t stride_l,
                                const bool count_include_pad, const int32_t divisor_override) {
  Avgpool1dForwardCompute<T>(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel,
                             x_length, kernel_size_l, stride_l, count_include_pad,
                             divisor_override);
};
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool2dForward(const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num,
                                const T* src, T* dest, const int32_t padding_h,
                                const int32_t padding_w, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_height,
                                const int32_t x_width, const int32_t kernel_size_h,
                                const int32_t kernel_size_w, const int32_t stride_h,
                                const int32_t stride_w, const bool count_include_pad,
                                const int32_t divisor_override) {
  Avgpool2dForwardCompute<T>(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch,
                             n_channel, x_height, x_width, kernel_size_h, kernel_size_w, stride_h,
                             stride_w, count_include_pad, divisor_override);
};
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool3dForward(const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num,
                                const T* src, T* dest, int32_t padding_t, const int32_t padding_h,
                                const int32_t padding_w, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_time,
                                const int32_t x_height, const int32_t x_width,
                                const int32_t kernel_size_t, int32_t kernel_size_h,
                                const int32_t kernel_size_w, const int32_t stride_t,
                                const int32_t stride_h, const int32_t stride_w,
                                const bool count_include_pad, const int32_t divisor_override) {
  Avgpool3dForwardCompute<T>(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w,
                             n_batch, n_channel, x_time, x_height, x_width, kernel_size_t,
                             kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w,
                             count_include_pad, divisor_override);
};
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool1dBackward(const NdIndexOffsetHelper<IDX, 2> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_l,
                                 const int32_t n_batch, const int32_t n_channel,
                                 const int32_t input_length, const int32_t kernel_size_l,
                                 const int32_t stride_l, const bool count_include_pad,
                                 const int32_t divisor_override) {
  Avgpool1dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel,
                              input_length, kernel_size_l, stride_l, count_include_pad,
                              divisor_override);
};
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool2dBackward(const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_h,
                                 const int32_t padding_w, const int32_t n_batch,
                                 const int32_t n_channel, const int32_t input_height,
                                 const int32_t input_width, const int32_t kernel_size_h,
                                 const int32_t kernel_size_w, const int32_t stride_h,
                                 const int32_t stride_w, const bool count_include_pad,
                                 int32_t divisor_override) {
  Avgpool2dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch,
                              n_channel, input_height, input_width, kernel_size_h, kernel_size_w,
                              stride_h, stride_w, count_include_pad, divisor_override);
};
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__ void DoCUDAAvgPool3dBackward(
    const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num, const T* src, T* dest,
    const int32_t padding_t, const int32_t padding_h, const int32_t padding_w,
    const int32_t n_batch, const int32_t n_channel, const int32_t x_time, const int32_t x_height,
    const int32_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h,
    const int32_t kernel_size_w, const int32_t stride_t, const int32_t stride_h,
    const int32_t stride_w, const bool count_include_pad, const int32_t divisor_override) {
  Avgpool3dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w,
                              n_batch, n_channel, x_time, x_height, x_width, kernel_size_t,
                              kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w,
                              count_include_pad, divisor_override);
};
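// AvgPoolKernelUtil dispatches the wrappers above. All pooling parameters are carried in an
// AvgPoolParams3D expressed in 3-D (depth/time, height, width) order, so the 1-D case reads
// index [2] (width), the 2-D case reads [1] and [2], and the 3-D case reads [0], [1], and [2];
// the spatial extents come from the trailing axes of GetXShape5D().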
template<typename T, typename IDX>
struct AvgPoolKernelUtil<DeviceType::kCUDA, T, IDX> {
  static void Avgpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool1dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(),
        params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }
  static void Avgpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool1dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(),
        params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }
  static void Avgpool2dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 3>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool2dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2],
        params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(),
        params_3d.divisor_override());
  }
  static void Avgpool2dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool2dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2],
        params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(),
        params_3d.divisor_override());
  }
  static void Avgpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool3dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1],
        params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4),
        params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2],
        params_3d.count_include_pad(), params_3d.divisor_override());
  }
  static void Avgpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool3dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1],
        params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4),
        params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2],
        params_3d.count_include_pad(), params_3d.divisor_override());
  }
};
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_AVG_POOL_KERNEL_UTIL, (DeviceType::kCUDA),
                                 AVG_POOL_DATA_TYPE_CUDA_SEQ, AVG_POOL_IDX_DATA_TYPE_SEQ);
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/user/kernels/batch_gather_kernel_util.h"
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <assert.h>
namespace oneflow {
namespace {
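// GetInOffset maps a flat offset into the output tensor, laid out as
// (batch, indices_num, instance_size), to the matching flat offset in the input tensor,
// laid out as (batch, gather_dim_size, instance_size). For example, with indices_num = 2,
// instance_size = 3 and gather_dim_size = 5, out_offset 7 decomposes into batch 1,
// indices index 0, inner index 1, and reads input element 1 * 5 * 3 + indices[2] * 3 + 1.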
template<typename K>
__device__ int64_t GetInOffset(const int64_t out_offset, const K* indices,
                               const int64_t indices_num, const int64_t instance_size,
                               const int64_t gather_dim_size) {
  const int64_t batch_idx = out_offset / (indices_num * instance_size);
  const int64_t indices_idx = out_offset % (indices_num * instance_size) / instance_size;
  const int64_t inner_idx = out_offset % instance_size;
  const int64_t idx = indices[batch_idx * indices_num + indices_idx];
  assert(idx >= 0 && idx < gather_dim_size);
  return batch_idx * gather_dim_size * instance_size + idx * instance_size + inner_idx;
}
template<typename T, typename K>
__global__ void BatchGatherForwardGpu(const int64_t elem_cnt, const T* in, const K* indices,
                                      const int64_t indices_num, const int64_t instance_size,
                                      const int64_t gather_dim_size, T* out) {
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) {
    out[i] = in[GetInOffset<K>(i, indices, indices_num, instance_size, gather_dim_size)];
  }
}
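// The backward pass scatters gradients back through the same index mapping. Because several
// output positions can point at the same input element when indices repeat, accumulation
// uses cuda::atomic::Add instead of a plain store.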
template<typename T, typename K>
__global__ void BatchGatherBackwardGpu(const int64_t elem_cnt, const T* out_diff, const K* indices,
                                       const int64_t indices_num, const int64_t instance_size,
                                       const int64_t gather_dim_size, T* in_diff) {
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) {
    cuda::atomic::Add(
        in_diff + GetInOffset<K>(i, indices, indices_num, instance_size, gather_dim_size),
        out_diff[i]);
  }
}
} // namespace
template<typename T, typename K>
struct BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K> final {
  static void Forward(ep::Stream* stream, const T* in, const K* indices,
                      const Shape& flat_out_shape, const int64_t gather_dim_size, T* out);
  static void Backward(ep::Stream* stream, const T* out_diff, const K* indices,
                       const Shape& flat_out_diff_shape, const int64_t gather_dim_size, T* in_diff);
};
template<typename T, typename K>
void BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K>::Forward(ep::Stream* stream, const T* in,
                                                                 const K* indices,
                                                                 const Shape& flat_out_shape,
                                                                 const int64_t gather_dim_size,
                                                                 T* out) {
  const int64_t batch_num = flat_out_shape.At(0);
  const int64_t indices_num = flat_out_shape.At(1);
  const int64_t instance_size = flat_out_shape.At(2);
  const int64_t elem_cnt = batch_num * indices_num * instance_size;
  BatchGatherForwardGpu<T, K><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                stream->As<ep::CudaStream>()->cuda_stream()>>>(
      elem_cnt, in, indices, indices_num, instance_size, gather_dim_size, out);
}
template<typename T, typename K>
void BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K>::Backward(
    ep::Stream* stream, const T* out_diff, const K* indices, const Shape& flat_out_diff_shape,
    const int64_t gather_dim_size, T* in_diff) {
  const int64_t batch_num = flat_out_diff_shape.At(0);
  const int64_t indices_num = flat_out_diff_shape.At(1);
  const int64_t instance_size = flat_out_diff_shape.At(2);
  const int64_t elem_cnt = batch_num * indices_num * instance_size;
  BatchGatherBackwardGpu<T, K><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                 stream->As<ep::CudaStream>()->cuda_stream()>>>(
      elem_cnt, out_diff, indices, indices_num, instance_size, gather_dim_size, in_diff);
}
#define INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA(in_type_pair, index_type_pair)          \
  template struct BatchGatherKernelUtilImpl<DeviceType::kCUDA, OF_PP_PAIR_FIRST(in_type_pair), \
                                            OF_PP_PAIR_FIRST(index_type_pair)>;
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA,
                                 FLOATING_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ);
#undef INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/user/kernels/loss_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace user_op {
namespace {
using namespace loss;
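// Element-wise binary cross entropy:
//   loss = -(target * log(input) + (1 - target) * log(1 - input))
// Each log term is clamped from below at -100 so that log(0) on a saturated input yields a
// large finite loss instead of -inf; inputs are expected to be probabilities in [0, 1].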
template<typename T>
struct BinaryCrossEntropyFunctor {
  T zero_;
  T one_;
  T negative_hundred_;
  BinaryCrossEntropyFunctor()
      : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()), negative_hundred_(static_cast<T>(-100)) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val) const {
    assert(input_val >= zero_);
    assert(input_val <= one_);
    return (target_val - one_) * max(static_cast<T>(log(one_ - input_val)), negative_hundred_)
           - target_val * max(static_cast<T>(log(input_val)), negative_hundred_);
  }
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
template<>
struct BinaryCrossEntropyFunctor<float> {
  float zero_;
  float one_;
  float negative_hundred_;
  BinaryCrossEntropyFunctor() : zero_(0.f), one_(1.f), negative_hundred_(-100.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val) const {
    assert(input_val >= zero_);
    assert(input_val <= one_);
    return (target_val - one_) * max(logf(one_ - input_val), negative_hundred_)
           - target_val * max(logf(input_val), negative_hundred_);
  }
  __device__ __forceinline__ float operator()(float input_val, float target_val,
                                              float weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
template<>
struct BinaryCrossEntropyFunctor<half> {
  BinaryCrossEntropyFunctor<float> float_functor;
  __device__ __forceinline__ half operator()(half input_val, half target_val) const {
    return __float2half(float_functor(__half2float(input_val), __half2float(target_val)));
  }
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
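// Gradient of the loss above with respect to `input`:
//   dL/dinput = dy * (input - target) / max((1 - input) * input, eps)
// where eps = 1e-12 keeps the denominator away from zero when the input saturates at 0 or 1.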
template<typename T>
struct BinaryCrossEntropyGradFunctor {
  T eps_;
  T one_;
  BinaryCrossEntropyGradFunctor() : eps_(static_cast<T>(1e-12)), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const {
    return dy_val * (input_val - target_val) / max((one_ - input_val) * input_val, eps_);
  }
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const {
    return (*this)(input_val, target_val, dy_val) * weight_val;
  }
};
template<>
struct BinaryCrossEntropyGradFunctor<half> {
  BinaryCrossEntropyGradFunctor<float> float_functor;
  BinaryCrossEntropyGradFunctor() {}
  __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const {
    return __float2half(
        float_functor(__half2float(input_val), __half2float(target_val), __half2float(dy_val)));
  }
  __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val,
                                             half weight_val) const {
    return __float2half(float_functor(__half2float(input_val), __half2float(target_val),
                                      __half2float(dy_val), __half2float(weight_val)));
  }
};
template<typename T>
class BinaryCrossEntropyKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyKernel() = default;
  ~BinaryCrossEntropyKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    if (ctx->has_input("weight", 0)) {
      const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
      OF_CUDA_CHECK(
          (cuda::elementwise::Ternary(BinaryCrossEntropyFunctor<T>(), elem_cnt, out, input, target,
                                      weight, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    } else {
      OF_CUDA_CHECK(
          (cuda::elementwise::Binary(BinaryCrossEntropyFunctor<T>(), elem_cnt, out, input, target,
                                     ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class BinaryCrossEntropyGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyGradKernel() = default;
  ~BinaryCrossEntropyGradKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    if (ctx->has_input("weight", 0)) {
      const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
      using FunctorT = BinaryCrossEntropyGradFunctor<T>;
      using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
      OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
          FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    } else {
      OF_CUDA_CHECK((cuda::elementwise::Ternary(
          BinaryCrossEntropyGradFunctor<T>(), elem_cnt, dx, input, target, dy,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
} // namespace
#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype)                                         \
  REGISTER_USER_KERNEL("binary_cross_entropy")                                              \
      .SetCreateFn<BinaryCrossEntropyKernel<dtype>>()                                       \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                      \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)   \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));
#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype)                                    \
  REGISTER_USER_KERNEL("binary_cross_entropy_grad")                                         \
      .SetCreateFn<BinaryCrossEntropyGradKernel<dtype>>()                                   \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                      \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)   \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)      \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double)
} // namespace user_op
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ndarray/ndarray_util.h"
#include "oneflow/core/ndarray/xpu_var_ndarray.h"
#include "oneflow/user/kernels/loss_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace user_op {
namespace {
using namespace loss;
enum class WeightType {
  kNone,
  kWeight,
  kPosWeight,
  kBoth,
};
template<typename T, WeightType WEIGHT_TYPE>
struct BinaryCrossEntropyWithLogitsFunctor;
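// BCE-with-logits is evaluated directly on the raw logit x in a numerically stable form:
//   loss = (1 - target) * x + max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
// which is algebraically equal to (1 - target) * x + log(1 + exp(-x)) but avoids overflow of
// exp(-x) for large negative x. The kPosWeight variant scales the log term by
// (pos_weight - target + 1) so positive samples can be re-weighted.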
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone> {
  T zero_;
  T one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val) const {
    const T max_val = -input_val < zero_ ? zero_ : -input_val;
    return (one_ - target_val) * input_val + max_val
           + (log(exp(-max_val) + exp(-input_val - max_val)));
  }
};
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight> {
  T zero_;
  T one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    const T max_val = -input_val < zero_ ? zero_ : -input_val;
    const T pos_weight_processed_val = weight_val - target_val + one_;
    return (one_ - target_val) * input_val
           + (pos_weight_processed_val
              * (log(exp(-max_val) + exp(-input_val - max_val)) + max_val));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kNone> {
  float zero_;
  float one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val) const {
    const float max_val = -input_val < zero_ ? zero_ : -input_val;
    return (one_ - target_val) * input_val + max_val
           + (logf(expf(-max_val) + expf(-input_val - max_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kPosWeight> {
  float zero_;
  float one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val,
                                              float weight_val) const {
    const float max_val = -input_val < zero_ ? zero_ : -input_val;
    const float pos_weight_processed_val = weight_val - target_val + one_;
    return (one_ - target_val) * input_val
           + (pos_weight_processed_val
              * (logf(expf(-max_val) + expf(-input_val - max_val)) + max_val));
  }
};
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    return f(input_val, target_val) * weight_val;
  }
};
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val,
                                          T pos_weight_val) const {
    return f(input_val, target_val, pos_weight_val) * weight_val;
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kNone> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kNone> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val) const {
    return __float2half(f(__half2float(input_val), __half2float(target_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kPosWeight> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kPosWeight> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return __float2half(
        f(__half2float(input_val), __half2float(target_val), __half2float(weight_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kWeight> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return __float2half(
        f(__half2float(input_val), __half2float(target_val), __half2float(weight_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kBoth> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val, half weight_val,
                                             half pos_weight_val) const {
    return __float2half(f(__half2float(input_val), __half2float(target_val),
                          __half2float(weight_val), __half2float(pos_weight_val)));
  }
};
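// CalSigmoid evaluates the logistic function through the identity
//   sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5,
// with a float specialization using tanhf and a half specialization that round-trips
// through float.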
template<typename T>
__device__ __forceinline__ T CalSigmoid(const T x) {
  const T half_of_one = static_cast<T>(0.5);
  return half_of_one * tanh(half_of_one * x) + half_of_one;
}

template<>
__device__ __forceinline__ float CalSigmoid(const float x) {
  const float half_of_one = static_cast<float>(0.5);
  return half_of_one * tanhf(half_of_one * x) + half_of_one;
}

template<>
__device__ __forceinline__ half CalSigmoid(const half x) {
  return __float2half(CalSigmoid(__half2float(x)));
}
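// Gradient functors: d/dx BCEWithLogits(x, t) = sigmoid(x) - t, scaled by the upstream
// gradient dy. The kPosWeight variant expects its weight argument to already be the
// pos_weight multiplied by the target (pos_weight_processed in the kernels below); the
// kWeight and kBoth variants additionally scale by the elementwise weight.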
template<typename T, WeightType WEIGHT_TYPE>
struct BinaryCrossEntropyWithLogitsGradFunctor;

template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone> {
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const {
    return (CalSigmoid(input_val) - target_val) * dy_val;
  }
};

template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight> {
  T one_;
  BinaryCrossEntropyWithLogitsGradFunctor() : one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val,
                                          T weight_val) const {
    return dy_val * ((weight_val + one_ - target_val) * CalSigmoid(input_val) - weight_val);
  }
};

template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val,
                                          T weight_val) const {
    return f(input_val, target_val, dy_val) * weight_val;
  }
};

template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val,
                                          T pos_weight_val) const {
    return f(input_val, target_val, dy_val, pos_weight_val) * weight_val;
  }
};
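// Forward kernel. When has_pos_weight is set, pos_weight (shaped like the last axis of
// target) is first broadcast-multiplied with target into the tmp buffer; the elementwise
// launch then picks the kBoth / kPosWeight / kWeight / kNone functor depending on which
// optional inputs are present.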
template<typename T>
class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsKernel() = default;
  ~BinaryCrossEntropyWithLogitsKernel() override = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    if (ctx->Attr<bool>("has_pos_weight")) {
      T* pos_weight_processed = tmp_buffer_blob->mut_dptr<T>();
      const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr<T>();
      Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes());
      pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1,
                           ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt());
      NdarrayUtil<DeviceType::kCUDA, T>::BroadcastMul(
          ctx->stream(), XpuVarNdarray<T>(target_blob->shape_view(), pos_weight_processed),
          XpuVarNdarray<const T>(pos_weight_shape, pos_weight),
          XpuVarNdarray<const T>(target_blob->shape_view(), target));
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kBoth>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, out, input, target, weight, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight>(), elem_cnt, out, input,
            target, pos_weight_processed, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    } else {
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kWeight>(), elem_cnt, out, input,
            target, weight, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Binary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone>(), elem_cnt, out, input,
            target, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
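// Backward kernel: mirrors the forward dispatch, but launches the gradient functors with
// dy as an extra input and writes into dx.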
template<typename T>
class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsGradKernel() = default;
  ~BinaryCrossEntropyWithLogitsGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    if (ctx->Attr<bool>("has_pos_weight")) {
      T* pos_weight_processed = tmp_buffer_blob->mut_dptr<T>();
      const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr<T>();
      Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes());
      pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1,
                           ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt());
      NdarrayUtil<DeviceType::kCUDA, T>::BroadcastMul(
          ctx->stream(), XpuVarNdarray<T>(target_blob->shape_view(), pos_weight_processed),
          XpuVarNdarray<const T>(pos_weight_shape, pos_weight),
          XpuVarNdarray<const T>(target_blob->shape_view(), target));
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kBoth>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    } else {
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kWeight>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone>(), elem_cnt, dx, input,
            target, dy, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
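// The tmp buffer only has to hold pos_weight_processed (one value per element of
// input/target), so the size functions request n * sizeof(T), CUDA-aligned, and only
// when has_pos_weight is set.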
template<typename T>
user_op::InferTmpSizeFn GenFwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) {
    const int64_t n = ctx->InputShape("input", 0).elem_cnt();
    size_t tmp_buffer_size = 0;
    if (ctx->Attr<bool>("has_pos_weight")) { tmp_buffer_size += GetCudaAlignedSize(n * sizeof(T)); }
    return tmp_buffer_size;
  };
}

template<typename T>
user_op::InferTmpSizeFn GenBwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) {
    const int64_t n = ctx->InputShape("target", 0).elem_cnt();
    size_t tmp_buffer_size = 0;
    if (ctx->Attr<bool>("has_pos_weight")) { tmp_buffer_size += GetCudaAlignedSize(n * sizeof(T)); }
    return tmp_buffer_size;
  };
}
} // namespace

#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype)                                        \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits")                                 \
      .SetCreateFn<BinaryCrossEntropyWithLogitsKernel<dtype>>()                            \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))   \
      .SetInferTmpSizeFn(GenFwInferTmpSizeFn<dtype>());

#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype)                                   \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_grad")                            \
      .SetCreateFn<BinaryCrossEntropyWithLogitsGradKernel<dtype>>()                        \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)     \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value))    \
      .SetInferTmpSizeFn(GenBwInferTmpSizeFn<dtype>());

REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double)

} // namespace user_op
} // namespace oneflow
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
/* /*
Copyright 2020 The OneFlow Authors. All rights reserved. Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
#include "oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel_util.h" #include "oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h" #include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/hip/elementwise.hip.h" #include "oneflow/core/hip/elementwise.hip.h"
#include <hipcub/hipcub.hpp> #include <hipcub/hipcub.hpp>
#include "oneflow/core/kernel/cuda_graph_support.h" #include "oneflow/core/kernel/cuda_graph_support.h"
namespace oneflow {
namespace user_op {
namespace {

constexpr int32_t kBlockSize = 1024;
constexpr int32_t kReduceLocalSumBlockSize = 1024;
constexpr int32_t kSingleBlockProcessNumThreshold = 1024;

template<typename T>
struct DefaultComputeType {
  using type = T;
};

template<>
struct DefaultComputeType<half> {
  using type = float;
};
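// GetNumBlocks derives a grid size from the device's multiprocessor count and the
// kernel's occupancy (active blocks per SM), capped by max_blocks and scaled by the
// requested number of waves, so the launch neither undersubscribes nor grossly
// oversubscribes the GPU.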
template<class Func>
inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size,
                               int64_t max_blocks, int64_t waves, int* num_blocks) {
  int dev;
  {
    hipError_t err = hipGetDevice(&dev);
    if (err != hipSuccess) { return err; }
  }
  int sm_count;
  {
    hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev);
    if (err != hipSuccess) { return err; }
  }
  int max_active_blocks;
  {
    hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func,
                                                                  block_size, dynamic_smem_size);
    if (err != hipSuccess) { return err; }
  }
  *num_blocks =
      std::max<int>(1, std::min<int64_t>(max_blocks, sm_count * max_active_blocks * waves));
  return hipSuccess;
}
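// Fused forward + reduce-mean kernel. Each thread accumulates the numerically stable
// per-element loss
//   (1 - t) * x + max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
// over a grid-stride loop, a block-wide hipcub reduction sums the partials, and thread 0
// of each block writes its partial sum divided by reduce_elem_cnt.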
template<typename In, typename Out, typename ComputeType>
__global__ void FusedBinaryCrossEntropyWithLogitsReduceMeanKernel(const In* input, const In* target,
                                                                   Out* out,
                                                                   const int32_t local_elem_cnt,
                                                                   const int32_t reduce_elem_cnt) {
  ComputeType zero = static_cast<ComputeType>(0.0);
  ComputeType one = static_cast<ComputeType>(1.0);
  using BlockReduce = hipcub::BlockReduce<ComputeType, kBlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  ComputeType reduce_sum = 0.0;
  CUDA_1D_KERNEL_LOOP(i, local_elem_cnt) {
    const ComputeType input_val = static_cast<ComputeType>(input[i]);
    const ComputeType target_val = static_cast<ComputeType>(target[i]);
    const ComputeType max_val = -input_val < zero ? zero : -input_val;
    const ComputeType result =
        (one - target_val) * input_val + max_val + (log(exp(-max_val) + exp(-input_val - max_val)));
    reduce_sum += result;
  }
  const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum);
  if (threadIdx.x == 0) { out[blockIdx.x] = static_cast<Out>(block_reduce_sum / reduce_elem_cnt); }
}

template<typename Out, typename ComputeType>
__global__ void ReduceLocalSumKernel(ComputeType* block_local_sum_buf, Out* out, int64_t elem_cnt) {
  using BlockReduce = hipcub::BlockReduce<ComputeType, kReduceLocalSumBlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  ComputeType reduce_sum = 0.0;
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) { reduce_sum += block_local_sum_buf[i]; }
  const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum);
  if (threadIdx.x == 0) { out[0] = static_cast<Out>(block_reduce_sum); }
}
template<typename T>
__device__ __forceinline__ T Sigmoid(const T x) {
  const T half_of_one = static_cast<T>(0.5);
  return half_of_one * tanh(half_of_one * x) + half_of_one;
}

template<>
__device__ __forceinline__ half Sigmoid(const half x) {
  return __float2half(Sigmoid(__half2float(x)));
}

template<typename T, typename ComputeType>
struct BinaryCrossEntropyWithLogitsReduceMeanGradFunctor {
  OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradFunctor(
      const T elem_cnt_reciprocal, const T dy)
      : elem_cnt_reciprocal(elem_cnt_reciprocal), dy(dy) {}
  __device__ T operator()(const T input_val, const T target_val) const {
    return (Sigmoid(input_val) - target_val) * dy * elem_cnt_reciprocal;
  }
  const T elem_cnt_reciprocal;
  const T dy;
};

template<typename T, typename ComputeType>
struct BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor {
  OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor(
      const int32_t elem_cnt, const T* dy_ptr)
      : elem_cnt_reciprocal(1.0f / elem_cnt), dy_ptr(dy_ptr) {}
  __device__ BinaryCrossEntropyWithLogitsReduceMeanGradFunctor<T, ComputeType> operator()() const {
    return BinaryCrossEntropyWithLogitsReduceMeanGradFunctor<T, ComputeType>(elem_cnt_reciprocal,
                                                                             *dy_ptr);
  }
  const T elem_cnt_reciprocal;
  const T* dy_ptr;
};
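// Reduce-mean forward kernel. Small inputs (<= kSingleBlockProcessNumThreshold elements)
// are handled by a single block that writes the mean directly to out; larger inputs first
// produce per-block partial means in tmp_buffer, which ReduceLocalSumKernel then sums
// into the scalar output.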
template<typename T>
class BinaryCrossEntropyWithLogitsMeanKernel final : public user_op::OpKernel,
                                                     public CudaGraphSupport {
 public:
  BinaryCrossEntropyWithLogitsMeanKernel() = default;
  ~BinaryCrossEntropyWithLogitsMeanKernel() override = default;

  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }

  std::shared_ptr<user_op::OpKernelCache> InitOpKernelCache(
      user_op::KernelCacheContext* ctx) const override {
    return CreateBCEWithLogitsReduceMeanKernelCache(ctx);
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache* cache) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    int64_t local_elem_cnt = input_blob->shape_view().elem_cnt();
    int64_t reduce_elem_cnt = local_elem_cnt;
    if (cache != nullptr) {
      // Because `out`'s SBP may be P or B, we need to use reduce_elem_cnt as the reduce_mean
      // factor.
      const auto* bce_cache = dynamic_cast<const BCEWithLogitsReduceMeanKernelCache*>(cache);
      CHECK_NOTNULL(bce_cache);
      reduce_elem_cnt = bce_cache->reduce_elem_cnt();
    }
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    using ComputeType = typename DefaultComputeType<T>::type;
    if (local_elem_cnt <= kSingleBlockProcessNumThreshold) {
      FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, T, ComputeType>
          <<<1, kBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              input_blob->dptr<T>(), target_blob->dptr<T>(), out_blob->mut_dptr<T>(),
              local_elem_cnt, reduce_elem_cnt);
    } else {
      auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
      const int64_t tmp_buffer_elem_cnt = tmp_buffer->shape_view().elem_cnt() / sizeof(T);
      const int64_t block_num = (local_elem_cnt + kBlockSize - 1) / kBlockSize;
      int launch_block = block_num;
      OF_CUDA_CHECK(GetNumBlocks(
          FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, ComputeType, ComputeType>,
          kBlockSize, 0, block_num, 32, &launch_block));
      launch_block = std::min<int32_t>(tmp_buffer_elem_cnt, launch_block);
      FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, ComputeType, ComputeType>
          <<<launch_block, kBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              input_blob->dptr<T>(), target_blob->dptr<T>(), tmp_buffer->mut_dptr<ComputeType>(),
              local_elem_cnt, reduce_elem_cnt);
      ReduceLocalSumKernel<T, ComputeType>
          <<<1, kReduceLocalSumBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              tmp_buffer->mut_dptr<ComputeType>(), out_blob->mut_dptr<T>(), block_num);
    }
  }
};
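// Reduce-mean backward kernel: dx[i] = (Sigmoid(input[i]) - target[i]) * dy / reduce_elem_cnt.
// The Dyptr factory reads the scalar dy directly from device memory when the elementwise
// kernel constructs its functor, so no device-to-host copy of dy is needed.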
template<typename T>
class BinaryCrossEntropyWithLogitsReduceMeanGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default;
  ~BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default;

  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }

  std::shared_ptr<user_op::OpKernelCache> InitOpKernelCache(
      user_op::KernelCacheContext* ctx) const override {
    return CreateBCEWithLogitsReduceMeanKernelCache(ctx);
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache* cache) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    int64_t local_elem_cnt = input_blob->shape_view().elem_cnt();
    int64_t reduce_elem_cnt = local_elem_cnt;
    if (cache != nullptr) {
      // Because `out`'s SBP may be P or B, we need to use reduce_elem_cnt as the reduce_mean
      // factor.
      const auto* bce_cache = dynamic_cast<const BCEWithLogitsReduceMeanKernelCache*>(cache);
      CHECK_NOTNULL(bce_cache);
      reduce_elem_cnt = bce_cache->reduce_elem_cnt();
    }
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    using ComputeType = typename DefaultComputeType<T>::type;
    OF_CUDA_CHECK((cuda::elementwise::BinaryWithFactory(
        BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor<T, ComputeType>(reduce_elem_cnt, dy),
        local_elem_cnt, dx, input, target, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
  }
};
} // namespace

#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(dtype)                                 \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean")                          \
      .SetCreateFn<BinaryCrossEntropyWithLogitsMeanKernel<dtype>>()                             \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                          \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)       \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value)      \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))        \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                       \
        const int64_t elem_cnt = ctx->InputShape("input", 0).elem_cnt();                        \
        const int64_t block_num = (elem_cnt + kBlockSize - 1) / kBlockSize;                     \
        int launch_block = block_num;                                                           \
        using ComputeType = typename DefaultComputeType<dtype>::type;                           \
        OF_CUDA_CHECK(GetNumBlocks(                                                             \
            FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<dtype, ComputeType, ComputeType>, \
            kBlockSize, 0, block_num, 32, &launch_block));                                      \
        const int64_t tmp_buffer_size = GetCudaAlignedSize(launch_block * sizeof(dtype));       \
        return tmp_buffer_size;                                                                 \
      });

#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(dtype)                       \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean_grad")                \
      .SetCreateFn<BinaryCrossEntropyWithLogitsReduceMeanGradKernel<dtype>>()              \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)     \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));

REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(double)

} // namespace user_op
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/ndarray/ndarray_util.h"
#include "oneflow/core/ndarray/xpu_var_ndarray.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"

namespace oneflow {
namespace {

template<typename T>
__global__ void ComputeLogGpu(const int64_t len, T* out, const T* in) {
  CUDA_1D_KERNEL_LOOP(i, len) { out[i] = SafeLog(in[i]); }
}

template<>
__global__ void ComputeLogGpu<float16>(const int64_t len, float16* out, const float16* in) {
  const half* _in = reinterpret_cast<const half*>(in);
  half* _out = reinterpret_cast<half*>(out);
  CUDA_1D_KERNEL_LOOP(i, len) { _out[i] = SafeLog(_in[i]); }
}
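// Gradient of z = pow(x, y) with respect to the (possibly broadcast) exponent y:
// dz/dy = z * log(x), so dy = reduce_sum(dz * z * log(x)) over the broadcast axes.
// The tmp buffer is zero-filled, broadcast-added with x to materialize x at z's shape,
// passed through SafeLog, multiplied by dz and z, and finally reduce-summed into dy.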
template<DeviceType device, typename T>
class BroadcastPowYGradKernel final : public user_op::OpKernel {
 public:
  BroadcastPowYGradKernel() = default;
  ~BroadcastPowYGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
    const user_op::Tensor* z_tensor = ctx->Tensor4ArgNameAndIndex("z", 0);
    const user_op::Tensor* dz_tensor = ctx->Tensor4ArgNameAndIndex("dz", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
    const int64_t num_axes = dz_tensor->shape_view().NumAxes();
    const int64_t elem_cnt = z_tensor->shape_view().elem_cnt();
    Memset<device>(ctx->stream(), tmp_buffer->mut_dptr<T>(), 0,
                   GetCudaAlignedSize(elem_cnt * sizeof(T)));
    XpuVarNdarray<const T> z(z_tensor->shape_view(), z_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<const T> dz(dz_tensor->shape_view(), dz_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<const T> const_tmp(dz.shape(), tmp_buffer->dptr<T>());
    XpuVarNdarray<T> tmp(dz.shape(), tmp_buffer->mut_dptr<T>());
    XpuVarNdarray<const T> x(x_tensor->shape_view(), x_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<T> dy(dy_tensor->shape_view(), dy_tensor->mut_dptr<T>(), num_axes);
    NdarrayUtil<device, T>::BroadcastAdd(ctx->stream(), tmp, x, const_tmp);
    ComputeLogGpu<T><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                       ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, tmp_buffer->mut_dptr<T>(), tmp_buffer->dptr<T>());
    NdarrayUtil<device, T>::BroadcastMul(ctx->stream(), tmp, dz, const_tmp);
    NdarrayUtil<device, T>::BroadcastMul(ctx->stream(), tmp, z, const_tmp);
    NdarrayUtil<device, T>::ReduceSum(ctx->stream(), dy, const_tmp, tmp);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
} // namespace

#define REGISTER_BROADCAST_POW_Y_GRAD_KERNEL(device, dtype_pair)                             \
  REGISTER_USER_KERNEL("broadcast_pow_y_grad")                                               \
      .SetCreateFn<BroadcastPowYGradKernel<device, OF_PP_PAIR_FIRST(dtype_pair)>>()          \
      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                  \
                       && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(dtype_pair)))   \
      .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) {                           \
        const user_op::TensorDesc& z = ctx->InputTensorDesc("z", 0);                         \
        const DataType& data_type = z.data_type();                                           \
        const int64_t elem_cnt = z.shape().elem_cnt();                                       \
        return GetCudaAlignedSize(elem_cnt * GetSizeOfDataType(data_type));                  \
      });

OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_BROADCAST_POW_Y_GRAD_KERNEL, (DeviceType::kCUDA),
                                 ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ)

} // namespace oneflow
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
/* /*
Copyright 2020 The OneFlow Authors. All rights reserved. Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
#ifdef NDEBUG #ifdef NDEBUG
#undef NDEBUG #undef NDEBUG
#endif #endif
#include <assert.h> #include <assert.h>
#include "oneflow/user/kernels/categorical_ordinal_encode_kernel_util.h" #include "oneflow/user/kernels/categorical_ordinal_encode_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.hip.h" #include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h" #include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow { namespace oneflow {
namespace { namespace {
using CuInt64T = unsigned long long int; using CuInt64T = unsigned long long int;
__device__ __inline__ int32_t AtomicCAS(int32_t* address, int32_t compare, int32_t val) {
  return atomicCAS(address, compare, val);
}

__device__ __inline__ int64_t AtomicCAS(int64_t* address, int64_t compare, int64_t val) {
  static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error");
  return static_cast<int64_t>(atomicCAS(reinterpret_cast<CuInt64T*>(address),
                                        static_cast<CuInt64T>(compare),
                                        static_cast<CuInt64T>(val)));
}

__device__ __inline__ int32_t AtomicAdd(int32_t* address, int32_t val) {
  return atomicAdd(address, val);
}

__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) {
  static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error");
  return static_cast<int64_t>(
      atomicAdd(reinterpret_cast<CuInt64T*>(address), static_cast<CuInt64T>(val)));
}
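// TryGetOrInsert claims an empty slot (key == 0) with an atomic compare-and-swap on the
// key. The winning thread draws a fresh ordinal from AtomicAdd(size, 1) + 1 and publishes
// it through the volatile value pointer; a thread that finds its own hash already in the
// slot spin-waits until that value becomes non-zero. A slot owned by a different hash
// reports failure so the caller keeps probing.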
template<typename K, typename V>
__device__ bool TryGetOrInsert(K* key, volatile V* value, V* size, const K hash, V* out) {
  K old_key = AtomicCAS(key, static_cast<K>(0), hash);
  if (old_key == 0) {
    V v = AtomicAdd(size, 1) + 1;
    *value = v;
    *out = v;
    return true;
  } else if (old_key == hash) {
    while (true) {
      V v = *value;
      if (v != 0) {
        *out = v;
        break;
      }
    }
    return true;
  } else {
    return false;
  }
}
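// GetOrInsertOne linearly probes the open-addressed table (stored as interleaved
// key/value pairs) starting at hash % capacity. The hash value 0 is reserved: it always
// encodes to 0 and marks empty slots. EncodeGpu maps every input hash to its ordinal and
// asserts that the table never fills up.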
template<typename T>
__device__ bool GetOrInsertOne(const size_t capacity, T* table, T* size, const T hash, T* out) {
  if (hash == 0) {
    *out = 0;
    return true;
  }
  const size_t start_idx = static_cast<size_t>(hash) % capacity;
  // fast path
  {
    T* key = table + start_idx * 2;
    T* value = key + 1;
    if (*key == hash && *value != 0) {
      *out = *value;
      return true;
    }
  }
  for (size_t count = 0; count < capacity; ++count) {
    const size_t idx = (start_idx + count) % capacity;
    T* key = table + idx * 2;
    T* value = key + 1;
    if (TryGetOrInsert<T, T>(key, value, size, hash, out)) { return true; }
  }
  return false;
}

template<typename T>
__global__ void EncodeGpu(const size_t capacity, T* table, T* size, const int64_t n, const T* hash,
                          T* out) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    bool success = GetOrInsertOne<T>(capacity, table, size, hash[i], out + i);
    assert(success);
  }
}
} // namespace

template<typename T>
struct CategoricalOrdinalEncodeKernelUtil<DeviceType::kCUDA, T> {
  static void Encode(ep::Stream* stream, int64_t capacity, T* table, T* size, int64_t n,
                     const T* hash, T* out) {
    EncodeGpu<T>
        <<<BlocksNum4ThreadsNum(n), kCudaThreadsNumPerBlock, 0,
           stream->As<ep::CudaStream>()->cuda_stream()>>>(capacity, table, size, n, hash, out);
  }
};

#define INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA(type_cpp, type_proto) \
  template struct CategoricalOrdinalEncodeKernelUtil<DeviceType::kCUDA, type_cpp>;
OF_PP_FOR_EACH_TUPLE(INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ);
#undef INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA

} // namespace oneflow