Commit 8f7de847 authored by yuguo960516yuguo's avatar yuguo960516yuguo
Browse files

dtk

parent f262efc9
Pipeline #248 failed with stages
in 0 seconds
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/ndarray/ndarray_assign_core.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/kernel/kernel_util.h"
namespace oneflow {
namespace {
// Device kernel: element-wise assignment of a reduced ndarray view into `y`.
// All iteration/assignment logic is delegated to NdarrayAssignCore.
template<typename T, typename X, int NDIMS>
__global__ void NdarrayAssignReducedGpu(XpuVarNdarray<T> y,
const XpuReducedNdarray<X, NDIMS> reduced) {
NdarrayAssignCore<T, X, NDIMS>::Assign(y, reduced);
}
// Device kernel: element-wise assignment of `x` into `y`.
// NOTE(review): any X->T conversion semantics live in NdarrayAssignCore — confirm there.
template<typename T, typename X, int NDIMS>
__global__ void NdarrayAssignGpu(XpuVarNdarray<T> y, const XpuVarNdarray<const X> x) {
NdarrayAssignCore<T, X, NDIMS>::Assign(y, x);
}
} // namespace
// Host-side wrapper that launches the assignment kernels on a CUDA/ROCm stream.
// T: destination element type; X: source element type; NDIMS: ndarray rank.
template<typename T, typename X, int NDIMS>
struct NdarrayAssignCoreWrapper<DeviceType::kCUDA, T, X, NDIMS> final {
  // Assigns the reduced view `reduced` into `*y`.
  static void Assign(ep::Stream* stream, XpuVarNdarray<T>* y,
                     const XpuReducedNdarray<X, NDIMS>& reduced) {
    size_t n = y->host_shape().HostElemNum();
    // Skip the launch for empty tensors: a zero-element grid is an invalid
    // launch configuration. The overload below already guards this; this
    // overload was missing the same check.
    if (n == 0) { return; }
    RUN_CUDA_KERNEL((NdarrayAssignReducedGpu<T, X, NDIMS>), stream, n, *y, reduced);
  }
  // Assigns `x` into `y` element-wise.
  static void Assign(ep::Stream* ctx, const XpuVarNdarray<T>& y, const XpuVarNdarray<const X>& x) {
    size_t n = y.host_shape().HostElemNum();
    if (n == 0) { return; }
    RUN_CUDA_KERNEL((NdarrayAssignGpu<T, X, NDIMS>), ctx, n, y, x);
  }
};
// Explicitly instantiate the wrapper for every supported
// (dst dtype, src dtype, ndims) combination.
#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS) \
template struct NdarrayAssignCoreWrapper<DeviceType::kCUDA, OF_PP_PAIR_FIRST(ret_dtype_pair), \
OF_PP_PAIR_FIRST(dtype_pair), NDIMS>;
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(
INSTANTIATE_NDARRAY_ASSIGN,
ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ,
ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, DIM_SEQ);
// half is instantiated separately and only for half<->half assignment.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ, HALF_DATA_TYPE_SEQ,
DIM_SEQ);
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/ndarray/ndarray_assign_core.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/kernel/kernel_util.h"
namespace oneflow {
namespace {
template<typename T, typename X, int NDIMS>
__global__ void NdarrayAssignReducedGpu(XpuVarNdarray<T> y,
const XpuReducedNdarray<X, NDIMS> reduced) {
NdarrayAssignCore<T, X, NDIMS>::Assign(y, reduced);
}
template<typename T, typename X, int NDIMS>
__global__ void NdarrayAssignGpu(XpuVarNdarray<T> y, const XpuVarNdarray<const X> x) {
NdarrayAssignCore<T, X, NDIMS>::Assign(y, x);
}
} // namespace
// Host-side wrapper that launches the assignment kernels on a CUDA/ROCm stream.
// T: destination element type; X: source element type; NDIMS: ndarray rank.
template<typename T, typename X, int NDIMS>
struct NdarrayAssignCoreWrapper<DeviceType::kCUDA, T, X, NDIMS> final {
  // Assigns the reduced view `reduced` into `*y`.
  static void Assign(ep::Stream* stream, XpuVarNdarray<T>* y,
                     const XpuReducedNdarray<X, NDIMS>& reduced) {
    size_t n = y->host_shape().HostElemNum();
    // Skip the launch for empty tensors: a zero-element grid is an invalid
    // launch configuration. The overload below already guards this; this
    // overload was missing the same check.
    if (n == 0) { return; }
    RUN_CUDA_KERNEL((NdarrayAssignReducedGpu<T, X, NDIMS>), stream, n, *y, reduced);
  }
  // Assigns `x` into `y` element-wise.
  static void Assign(ep::Stream* ctx, const XpuVarNdarray<T>& y, const XpuVarNdarray<const X>& x) {
    size_t n = y.host_shape().HostElemNum();
    if (n == 0) { return; }
    RUN_CUDA_KERNEL((NdarrayAssignGpu<T, X, NDIMS>), ctx, n, y, x);
  }
};
// Explicitly instantiate the wrapper for every supported
// (dst dtype, src dtype, ndims) combination.
#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS) \
template struct NdarrayAssignCoreWrapper<DeviceType::kCUDA, OF_PP_PAIR_FIRST(ret_dtype_pair), \
OF_PP_PAIR_FIRST(dtype_pair), NDIMS>;
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(
INSTANTIATE_NDARRAY_ASSIGN,
ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ,
ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, DIM_SEQ);
// half is instantiated separately and only for half<->half assignment.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ, HALF_DATA_TYPE_SEQ,
DIM_SEQ);
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "fmt/core.h"
#include "fmt/format.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/util.h"
using json = nlohmann::json;
namespace oneflow {
namespace profiler {
// Serializes the fields common to all events; subclasses extend this object.
nlohmann::json IEvent::ToJson() {
return json{{"name", name_}, {"time", GetDuration<double>()}, {"input_shapes", "-"}};
}
// Raw setters for the timestamps (expressed in this event's time_unit_).
void IEvent::SetStartedAt(double t) { started_at_ = t; }
void IEvent::SetFinishedAt(double t) { finished_at_ = t; }
// Stamp the current time as the event's start/finish.
void IEvent::Start() { SetStartedAt(GetTimeNow()); }
void IEvent::Finish() { SetFinishedAt(GetTimeNow()); }
// True iff this event's [start, finish] interval is contained in e's interval.
// An event is never a child of null or of itself.
bool IEvent::IsChildOf(const IEvent* e) {
if (!e) { return false; }
if (this == e) { return false; }
return GetStartedAt<double>() >= e->GetStartedAt<double>()
&& GetFinishedAt<double>() <= e->GetFinishedAt<double>();
}
const std::string& IEvent::GetName() const { return name_; }
// Custom events aggregate by name alone (no input shapes in the key).
std::string CustomEvent::Key() { return name_; }
nlohmann::json CustomEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kCustom;
j["custom_type"] = type_;
return j;
}
// Factory; the constructor is private, so events are always shared_ptr-owned.
std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, CustomEventType type) {
return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
}
// Kernel events are keyed by name plus formatted input shapes so the same op
// invoked with different shapes aggregates separately.
std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
nlohmann::json KernelEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kOneflowKernel;
j["input_shapes"] = GetFormatedInputShapes();
#if defined(WITH_CUDA) || defined(WITH_ROCM)
j["memory_size"] = memory_size_;
if (!children_.empty()) { j["children"] = children_; }
#endif // WITH_CUDA || WITH_ROCM
return j;
}
// Factory; shape_getter is invoked once in the private constructor.
std::shared_ptr<KernelEvent> KernelEvent::Create(
const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter) {
return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
}
// Formats up to `max_num_to_format` recorded input shapes as "[s1, s2, ...]".
// Returns "-" when no shapes were recorded; a scalar shape "()" renders as
// "scalar"; a trailing "..." marks a truncated list.
std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
  if (input_shapes_.empty()) { return "-"; }
  std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
  // size_t index: the original `auto i = 0` deduced a signed int and was
  // compared against the unsigned size() on every iteration.
  for (size_t i = 0; i < shapes_formated.size(); ++i) {
    const std::string current_shape = input_shapes_[i].ToString();
    shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
  }
  if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
  return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
}
} // namespace profiler
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "fmt/core.h"
#include "fmt/format.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/util.h"
using json = nlohmann::json;
namespace oneflow {
namespace profiler {
// Serializes the fields common to all events; subclasses extend this object.
nlohmann::json IEvent::ToJson() {
return json{{"name", name_}, {"time", GetDuration<double>()}, {"input_shapes", "-"}};
}
// Raw setters for the timestamps (expressed in this event's time_unit_).
void IEvent::SetStartedAt(double t) { started_at_ = t; }
void IEvent::SetFinishedAt(double t) { finished_at_ = t; }
// Stamp the current time as the event's start/finish.
void IEvent::Start() { SetStartedAt(GetTimeNow()); }
void IEvent::Finish() { SetFinishedAt(GetTimeNow()); }
// True iff this event's [start, finish] interval is contained in e's interval.
// An event is never a child of null or of itself.
bool IEvent::IsChildOf(const IEvent* e) {
if (!e) { return false; }
if (this == e) { return false; }
return GetStartedAt<double>() >= e->GetStartedAt<double>()
&& GetFinishedAt<double>() <= e->GetFinishedAt<double>();
}
const std::string& IEvent::GetName() const { return name_; }
// Custom events aggregate by name alone (no input shapes in the key).
std::string CustomEvent::Key() { return name_; }
nlohmann::json CustomEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kCustom;
j["custom_type"] = type_;
return j;
}
// Factory; the constructor is private, so events are always shared_ptr-owned.
std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, CustomEventType type) {
return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
}
// Kernel events are keyed by name plus formatted input shapes so the same op
// invoked with different shapes aggregates separately.
std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
nlohmann::json KernelEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kOneflowKernel;
j["input_shapes"] = GetFormatedInputShapes();
#if defined(WITH_CUDA) || defined(WITH_ROCM)
j["memory_size"] = memory_size_;
if (!children_.empty()) { j["children"] = children_; }
#endif // WITH_CUDA || WITH_ROCM
return j;
}
// Factory; shape_getter is invoked once in the private constructor.
std::shared_ptr<KernelEvent> KernelEvent::Create(
const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter) {
return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
}
// Formats up to `max_num_to_format` recorded input shapes as "[s1, s2, ...]".
// Returns "-" when no shapes were recorded; a scalar shape "()" renders as
// "scalar"; a trailing "..." marks a truncated list.
std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
  if (input_shapes_.empty()) { return "-"; }
  std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
  // size_t index: the original `auto i = 0` deduced a signed int and was
  // compared against the unsigned size() on every iteration.
  for (size_t i = 0; i < shapes_formated.size(); ++i) {
    const std::string current_shape = input_shapes_[i].ToString();
    shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
  }
  if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
  return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
}
} // namespace profiler
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_
#define ONEFLOW_CORE_PROFILER_EVENT_H_
#include <functional>
#include <memory>
#include <vector>
#include "nlohmann/json.hpp"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/shape_view.h"
namespace oneflow {
namespace profiler {
class ProfileManager;
// Top-level classification of profiler events.
enum class EventType {
kCustom, // has three kinds
kOneflowKernel // OneFlow cpu/cuda kernel
};
// Sub-type for kCustom events.
enum class CustomEventType {
kDefault, // for record_function
kCudaKernel, // cuda kernel
kCudaRuntime // something like cudaLaunchKernel
};
// Unit in which an event stores its raw timestamps.
enum class EventTimeUnit { kNS, kUS };
// Abstract base class for profiler events: stores a name, the unit of its raw
// timestamps, and start/finish times. Subclasses define Key() and extend
// ToJson().
class IEvent {
public:
OF_DISALLOW_COPY_AND_MOVE(IEvent);
IEvent() = delete;
IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {}
// Aggregation key for this event (subclass-specific).
virtual std::string Key() = 0;
virtual nlohmann::json ToJson();
virtual ~IEvent() = default;
// Stamp the current time as start/finish.
virtual void Start();
virtual void Finish();
// True iff this event's time interval is contained in e's interval.
bool IsChildOf(const IEvent* e);
const std::string& GetName() const;
// Accessors converting stored times into the requested unit (default kUS).
template<typename T>
const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
template<typename T>
const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
template<typename T>
const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
protected:
virtual void SetStartedAt(double t);
virtual void SetFinishedAt(double t);
std::string name_;
EventTimeUnit time_unit_; // unit of started_at_/finished_at_
double started_at_ = 0;
double finished_at_ = 0;
};
// Converts a timestamp between nanoseconds and microseconds; any other
// src/dst combination (including identical units) passes through unchanged.
inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) {
  const bool ns_to_us =
      src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS;
  const bool us_to_ns =
      src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS;
  if (ns_to_us) { return time_ / 1000; }
  if (us_to_ns) { return time_ * 1000; }
  return time_;
}
// Explicit specializations of the time accessors: the double versions perform
// the unit conversion; the time_t versions truncate the double result.
template<>
const inline double IEvent::GetStartedAt<double>(EventTimeUnit time_unit) const {
return ConvertTime(started_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetStartedAt<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetStartedAt<double>(time_unit));
}
template<>
const inline double IEvent::GetFinishedAt<double>(EventTimeUnit time_unit) const {
return ConvertTime(finished_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetFinishedAt<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetFinishedAt<double>(time_unit));
}
// Duration = finish - start, both converted to the requested unit first.
template<>
const inline double IEvent::GetDuration<double>(EventTimeUnit time_unit) const {
return GetFinishedAt<double>(time_unit) - GetStartedAt<double>(time_unit);
}
template<>
const inline time_t IEvent::GetDuration<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetDuration<double>(time_unit));
}
// User-defined (non-kernel) profiler event, e.g. from record_function.
class CustomEvent final : public IEvent {
public:
friend class ProfileManager;
std::string Key() override;
nlohmann::json ToJson() override;
static std::shared_ptr<CustomEvent> Create(const std::string& name,
CustomEventType type = CustomEventType::kDefault);
private:
CustomEventType type_;
// kDefault events record timestamps in nanoseconds, all others in microseconds.
CustomEvent(const std::string& custom_name, CustomEventType type)
: IEvent(custom_name,
type == CustomEventType::kDefault ? EventTimeUnit::kNS : EventTimeUnit::kUS),
type_(type) {}
};
// Profiler event for one OneFlow kernel execution. Records input shapes and,
// on CUDA/ROCm builds, allocated memory size plus nested child events.
class KernelEvent final : public IEvent {
public:
std::string Key() override;
nlohmann::json ToJson() override;
static std::shared_ptr<KernelEvent> Create(
const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter);
#if defined(WITH_CUDA) || defined(WITH_ROCM)
void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
// Adds `e` only when its time interval nests inside this event's interval.
bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
if (e->IsChildOf(dynamic_cast<IEvent*>(this))) {
children_.emplace(e);
return true;
}
return false;
}
bool HasChildEvent(const std::shared_ptr<IEvent>& e) { return children_.count(e); }
// Applies `f` to each child event (iteration order: std::set ordering).
void WalkAmongChildren(const std::function<void(const std::shared_ptr<IEvent>& e)>& f) const {
for (const auto& x : children_) { f(x); }
}
#endif // WITH_CUDA || WITH_ROCM
private:
KernelEvent(const std::string& kernel_name,
const std::function<std::vector<Shape>(void)>& shape_getter)
: IEvent(kernel_name, EventTimeUnit::kNS) {
if (shape_getter) { input_shapes_ = shape_getter(); }
}
#if defined(WITH_CUDA) || defined(WITH_ROCM)
int64_t memory_size_ = -1; // -1 means "not recorded"
// NOTE(review): <set> is not included by this header — presumably pulled in
// transitively via util.h; confirm before relying on it.
std::set<std::shared_ptr<IEvent>> children_;
#endif // WITH_CUDA || WITH_ROCM
std::vector<Shape> input_shapes_;
std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
};
} // namespace profiler
} // namespace oneflow
namespace nlohmann {
// ADL hook so nlohmann::json can serialize shared_ptr<IEvent> directly
// (used when KernelEvent emits its children_).
inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) {
j = event->ToJson();
}
} // namespace nlohmann
#endif // ONEFLOW_CORE_PROFILER_EVENT_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_
#define ONEFLOW_CORE_PROFILER_EVENT_H_
#include <functional>
#include <memory>
#include <vector>
#include "nlohmann/json.hpp"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/shape_view.h"
namespace oneflow {
namespace profiler {
class ProfileManager;
// Top-level classification of profiler events.
enum class EventType {
kCustom, // has three kinds
kOneflowKernel // OneFlow cpu/cuda kernel
};
// Sub-type for kCustom events.
enum class CustomEventType {
kDefault, // for record_function
kCudaKernel, // cuda kernel
kCudaRuntime // something like cudaLaunchKernel
};
// Unit in which an event stores its raw timestamps.
enum class EventTimeUnit { kNS, kUS };
// Abstract base class for profiler events: stores a name, the unit of its raw
// timestamps, and start/finish times. Subclasses define Key() and extend
// ToJson().
class IEvent {
public:
OF_DISALLOW_COPY_AND_MOVE(IEvent);
IEvent() = delete;
IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {}
// Aggregation key for this event (subclass-specific).
virtual std::string Key() = 0;
virtual nlohmann::json ToJson();
virtual ~IEvent() = default;
// Stamp the current time as start/finish.
virtual void Start();
virtual void Finish();
// True iff this event's time interval is contained in e's interval.
bool IsChildOf(const IEvent* e);
const std::string& GetName() const;
// Accessors converting stored times into the requested unit (default kUS).
template<typename T>
const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
template<typename T>
const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
template<typename T>
const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
protected:
virtual void SetStartedAt(double t);
virtual void SetFinishedAt(double t);
std::string name_;
EventTimeUnit time_unit_; // unit of started_at_/finished_at_
double started_at_ = 0;
double finished_at_ = 0;
};
// Converts a timestamp between nanoseconds and microseconds; any other
// src/dst combination (including identical units) passes through unchanged.
inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) {
  const bool ns_to_us =
      src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS;
  const bool us_to_ns =
      src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS;
  if (ns_to_us) { return time_ / 1000; }
  if (us_to_ns) { return time_ * 1000; }
  return time_;
}
// Explicit specializations of the time accessors: the double versions perform
// the unit conversion; the time_t versions truncate the double result.
template<>
const inline double IEvent::GetStartedAt<double>(EventTimeUnit time_unit) const {
return ConvertTime(started_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetStartedAt<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetStartedAt<double>(time_unit));
}
template<>
const inline double IEvent::GetFinishedAt<double>(EventTimeUnit time_unit) const {
return ConvertTime(finished_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetFinishedAt<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetFinishedAt<double>(time_unit));
}
// Duration = finish - start, both converted to the requested unit first.
template<>
const inline double IEvent::GetDuration<double>(EventTimeUnit time_unit) const {
return GetFinishedAt<double>(time_unit) - GetStartedAt<double>(time_unit);
}
template<>
const inline time_t IEvent::GetDuration<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetDuration<double>(time_unit));
}
// User-defined (non-kernel) profiler event, e.g. from record_function.
class CustomEvent final : public IEvent {
public:
friend class ProfileManager;
std::string Key() override;
nlohmann::json ToJson() override;
static std::shared_ptr<CustomEvent> Create(const std::string& name,
CustomEventType type = CustomEventType::kDefault);
private:
CustomEventType type_;
// kDefault events record timestamps in nanoseconds, all others in microseconds.
CustomEvent(const std::string& custom_name, CustomEventType type)
: IEvent(custom_name,
type == CustomEventType::kDefault ? EventTimeUnit::kNS : EventTimeUnit::kUS),
type_(type) {}
};
// Profiler event for one OneFlow kernel execution. Records input shapes and,
// on CUDA/ROCm builds, allocated memory size plus nested child events.
class KernelEvent final : public IEvent {
public:
std::string Key() override;
nlohmann::json ToJson() override;
static std::shared_ptr<KernelEvent> Create(
const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter);
#if defined(WITH_CUDA) || defined(WITH_ROCM)
void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
// Adds `e` only when its time interval nests inside this event's interval.
bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
if (e->IsChildOf(dynamic_cast<IEvent*>(this))) {
children_.emplace(e);
return true;
}
return false;
}
bool HasChildEvent(const std::shared_ptr<IEvent>& e) { return children_.count(e); }
// Applies `f` to each child event (iteration order: std::set ordering).
void WalkAmongChildren(const std::function<void(const std::shared_ptr<IEvent>& e)>& f) const {
for (const auto& x : children_) { f(x); }
}
#endif // WITH_CUDA || WITH_ROCM
private:
KernelEvent(const std::string& kernel_name,
const std::function<std::vector<Shape>(void)>& shape_getter)
: IEvent(kernel_name, EventTimeUnit::kNS) {
if (shape_getter) { input_shapes_ = shape_getter(); }
}
#if defined(WITH_CUDA) || defined(WITH_ROCM)
int64_t memory_size_ = -1; // -1 means "not recorded"
// NOTE(review): <set> is not included by this header — presumably pulled in
// transitively via util.h; confirm before relying on it.
std::set<std::shared_ptr<IEvent>> children_;
#endif // WITH_CUDA || WITH_ROCM
std::vector<Shape> input_shapes_;
std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
};
} // namespace profiler
} // namespace oneflow
namespace nlohmann {
// ADL hook so nlohmann::json can serialize shared_ptr<IEvent> directly
// (used when KernelEvent emits its children_).
inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) {
j = event->ToJson();
}
} // namespace nlohmann
#endif // ONEFLOW_CORE_PROFILER_EVENT_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#include "oneflow/core/common/util.h"
#include "oneflow/core/profiler/event.h"
namespace oneflow {
namespace profiler {
// RAII recorder: registers `event` with the ProfileManager and starts it on
// construction; finishes it (stamping the end time) on destruction.
class EventRecorder {
public:
using ShapeGetterFuncType = std::function<std::vector<Shape>(void)>;
OF_DISALLOW_COPY_AND_MOVE(EventRecorder);
explicit EventRecorder(const std::shared_ptr<IEvent>& event) : event_(event) {
CHECK_JUST(RegisterEventToProfileManager(event));
event_->Start();
}
Maybe<void> RegisterEventToProfileManager(const std::shared_ptr<IEvent>& event);
~EventRecorder() {
if (event_) {
event_->Finish();
event_.reset();
}
}
static std::shared_ptr<EventRecorder> CreateCustomEventRecorder(const std::string& name);
// memory_size_getter exists only on CUDA/ROCm builds.
static Maybe<EventRecorder> CreateKernelEventRecorder(
const std::string& name,
#if defined(WITH_CUDA) || defined(WITH_ROCM)
const std::function<int64_t()>& memory_size_getter,
#endif
const ShapeGetterFuncType& shape_getter);
private:
std::shared_ptr<IEvent> event_;
};
} // namespace profiler
} // namespace oneflow
#endif // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#include "oneflow/core/common/util.h"
#include "oneflow/core/profiler/event.h"
namespace oneflow {
namespace profiler {
// RAII recorder: registers `event` with the ProfileManager and starts it on
// construction; finishes it (stamping the end time) on destruction.
class EventRecorder {
public:
using ShapeGetterFuncType = std::function<std::vector<Shape>(void)>;
OF_DISALLOW_COPY_AND_MOVE(EventRecorder);
explicit EventRecorder(const std::shared_ptr<IEvent>& event) : event_(event) {
CHECK_JUST(RegisterEventToProfileManager(event));
event_->Start();
}
Maybe<void> RegisterEventToProfileManager(const std::shared_ptr<IEvent>& event);
~EventRecorder() {
if (event_) {
event_->Finish();
event_.reset();
}
}
static std::shared_ptr<EventRecorder> CreateCustomEventRecorder(const std::string& name);
// memory_size_getter exists only on CUDA/ROCm builds.
static Maybe<EventRecorder> CreateKernelEventRecorder(
const std::string& name,
#if defined(WITH_CUDA) || defined(WITH_ROCM)
const std::function<int64_t()>& memory_size_getter,
#endif
const ShapeGetterFuncType& shape_getter);
private:
std::shared_ptr<IEvent> event_;
};
} // namespace profiler
} // namespace oneflow
#endif // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
#define ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
#include "oneflow/core/common/thread_local_guard.h"
namespace oneflow {
// Thread-local mode controlling whether the virtual machine runs synchronously.
enum class SyncVmMode {
kInvalid = 0,
kEnable = 1,
kDisable = 2,
};
// Scoped thread-local guard setting the current SyncVmMode; query with
// IsCurrentSyncVmMode().
class SyncVmModeGuard final : public ThreadLocalGuard<SyncVmMode> {
public:
using ThreadLocalGuard<SyncVmMode>::ThreadLocalGuard;
~SyncVmModeGuard() = default;
// True only when a guard is active on this thread AND it enables sync mode.
static bool IsCurrentSyncVmMode() {
const auto& opt_sync_mode = Current();
return opt_sync_mode.has_value() && CHECK_JUST(opt_sync_mode) == SyncVmMode::kEnable;
}
};
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
#define ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
#include "oneflow/core/common/thread_local_guard.h"
namespace oneflow {
// Thread-local mode controlling whether the virtual machine runs synchronously.
enum class SyncVmMode {
kInvalid = 0,
kEnable = 1,
kDisable = 2,
};
// Scoped thread-local guard setting the current SyncVmMode; query with
// IsCurrentSyncVmMode().
class SyncVmModeGuard final : public ThreadLocalGuard<SyncVmMode> {
public:
using ThreadLocalGuard<SyncVmMode>::ThreadLocalGuard;
~SyncVmModeGuard() = default;
// True only when a guard is active on this thread AND it enables sync mode.
static bool IsCurrentSyncVmMode() {
const auto& opt_sync_mode = Current();
return opt_sync_mode.has_value() && CHECK_JUST(opt_sync_mode) == SyncVmMode::kEnable;
}
};
} // namespace oneflow
#endif // ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/common/data_type.h"
#include "oneflow/core/kernel/util/cuda_half_util.h"
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/operator/operator_util.h"
#include "oneflow/user/utils/pool_util.h"
#include <algorithm>
#include <cfloat>
#include <cmath>
namespace oneflow {
namespace user_op {
// Adaptive-pooling window bounds: output cell a of b cells over c input cells
// covers [floor(a*c/b), ceil((a+1)*c/b)). The *_INT variants compute the same
// bounds in pure integer arithmetic.
#define START_IND(a, b, c) (int)std::floor((float)(a * c) / b)
#define END_IND(a, b, c) (int)std::ceil((float)((a + 1) * c) / b)
#define START_IND_INT(a, b, c) ((a * c) / b)
#define END_IND_INT(a, b, c) (((a + 1) * c + b - 1) / b)
// Zero-fills `elements` entries of `ptr` using a grid-stride loop, so any
// launch configuration covers the whole buffer.
template<typename T>
__global__ void InitPtr(int elements, T* ptr) {
  const int stride = gridDim.x * blockDim.x;
  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < elements; i += stride) {
    ptr[i] = static_cast<T>(0);
  }
}
// Normalizes a tensor shape to 5-D (N, C, D, H, W): the three spatial entries
// come from GetInDim for the given data_format and spatial rank `dim`.
inline Shape GetShape5D(const Shape& shape, const std::string& data_format, int32_t dim) {
FixedDimVector shape_3d = {GetInDim(shape, data_format, 0, dim),
GetInDim(shape, data_format, 1, dim),
GetInDim(shape, data_format, 2, dim)};
return Shape({shape.At(0), shape.At(1), shape_3d.at(0), shape_3d.at(1), shape_3d.at(2)});
}
// Forward adaptive average pooling over the (D, H, W) dims of a 5-D layout.
// One output element per loop iteration: each output cell averages its
// adaptive input window [START_IND, END_IND) on every spatial axis.
// 1-D/2-D pooling reuses this kernel with the unused dims set to 1.
template<typename T>
__global__ void AdaptiveAvgPoolCudaKernel(const T* input, T* output, int num_elems, int in_d,
int in_h, int in_w, int out_d, int out_h, int out_w) {
const int out_panel_size = out_d * out_h * out_w;
const int in_panel_size = in_d * in_h * in_w;
CUDA_1D_KERNEL_LOOP(idx, num_elems) {
// TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper'
// Decompose flat idx into (batch*channel, d, h, w) output coordinates.
int bc_idx = idx / out_panel_size;
int out_d_idx = (idx % out_panel_size) / out_w / out_h;
int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w;
int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w;
// Adaptive window extents along each spatial axis.
int in_start_d = START_IND(out_d_idx, out_d, in_d);
int in_end_d = END_IND(out_d_idx, out_d, in_d);
int k_d = in_end_d - in_start_d;
int in_start_h = START_IND(out_h_idx, out_h, in_h);
int in_end_h = END_IND(out_h_idx, out_h, in_h);
int k_h = in_end_h - in_start_h;
int in_start_w = START_IND(out_w_idx, out_w, in_w);
int in_end_w = END_IND(out_w_idx, out_w, in_w);
int k_w = in_end_w - in_start_w;
const T* in_ptr =
input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w;
// Sum the k_d x k_h x k_w window, then divide by the window volume.
T sum = static_cast<T>(0);
for (int id = 0; id < k_d; ++id) {
for (int ih = 0; ih < k_h; ++ih) {
for (int iw = 0; iw < k_w; ++iw) {
T val = *(in_ptr + ih * in_w + iw);
sum += val;
}
}
in_ptr += in_h * in_w; // next input depth
}
// Update output
output[idx] = sum / k_d / k_h / k_w;
}
}
// Backward adaptive average pooling: scatters each output-gradient element,
// scaled by 1/window_volume, back over its adaptive input window. Uses atomic
// adds because neighbouring output cells may have overlapping input windows.
// `input` is the gradient buffer to accumulate into (must be pre-zeroed);
// `output` is the upstream gradient.
template<typename T>
__global__ void AdaptiveAvgPoolGradCudaKernel(T* input, const T* output, int num_elems, int in_d,
int in_h, int in_w, int out_d, int out_h, int out_w) {
const int out_panel_size = out_d * out_h * out_w;
const int in_panel_size = in_d * in_h * in_w;
CUDA_1D_KERNEL_LOOP(idx, num_elems) {
// TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper'
// Decompose flat idx into (batch*channel, d, h, w) output coordinates.
int bc_idx = idx / out_panel_size;
int out_d_idx = (idx % out_panel_size) / out_w / out_h;
int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w;
int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w;
// Adaptive window extents along each spatial axis.
int in_start_d = START_IND(out_d_idx, out_d, in_d);
int in_end_d = END_IND(out_d_idx, out_d, in_d);
int k_d = in_end_d - in_start_d;
int in_start_h = START_IND(out_h_idx, out_h, in_h);
int in_end_h = END_IND(out_h_idx, out_h, in_h);
int k_h = in_end_h - in_start_h;
int in_start_w = START_IND(out_w_idx, out_w, in_w);
int in_end_w = END_IND(out_w_idx, out_w, in_w);
int k_w = in_end_w - in_start_w;
// Each input cell in the window receives grad / window volume.
const T grad_delta = output[idx] / k_d / k_h / k_w;
T* input_ptr =
input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w;
for (int id = 0; id < k_d; ++id) {
for (int ih = 0; ih < k_h; ++ih) {
for (int iw = 0; iw < k_w; ++iw) {
// TODO (Tianyu): Use 'atmoic::Add' when necessary
cuda::atomic::Add(input_ptr + ih * in_w + iw, grad_delta);
}
}
input_ptr += in_h * in_w; // next input depth
}
}
}
// Host-side launcher for forward adaptive average pooling ("x" -> "y").
// `dim` is the number of spatial dimensions (1, 2 or 3); both shapes are
// normalized to 5-D (N, C, D, H, W) so one kernel serves all ranks.
template<typename T>
void AvgForwardCompute(KernelComputeContext* ctx, const int32_t& dim) {
  const Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
  Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
  const T* in_ptr = in_tensor->dptr<T>();
  T* out_ptr = out_tensor->mut_dptr<T>();
  const Shape& x_shape = ctx->TensorDesc4ArgNameAndIndex("x", 0)->shape();
  const Shape& y_shape = ctx->TensorDesc4ArgNameAndIndex("y", 0)->shape();
  // TODO (Tianyu): Support 'channels_last'
  std::string data_format = "channels_first";
  const Shape& in = GetShape5D(x_shape, data_format, dim);
  const Shape& out = GetShape5D(y_shape, data_format, dim);
  const int out_elems = out_tensor->shape_view().elem_cnt();
  // Skip the launch for empty outputs: a zero-element grid is an invalid
  // launch configuration (matches the n == 0 guards used elsewhere).
  if (out_elems == 0) { return; }
  RUN_CUDA_KERNEL((AdaptiveAvgPoolCudaKernel<T>), ctx->stream(), out_elems, in_ptr, out_ptr,
                  out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4));
}
// Host-side launcher for adaptive average pooling backward.
// Zero-fills dx first (the grad kernel accumulates with atomic adds), then
// scatters dy into dx with one GPU thread per dy element.
template<typename T>
void AvgBackwardCompute(KernelComputeContext* ctx, const int32_t& dim) {
  const Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
  Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
  const Shape& dx_shape = ctx->TensorDesc4ArgNameAndIndex("dx", 0)->shape();
  const Shape& dy_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape();
  // TODO (Tianyu): Support 'channels_last'
  const std::string data_format = "channels_first";
  const Shape& in_5d = GetShape5D(dx_shape, data_format, dim);
  const Shape& out_5d = GetShape5D(dy_shape, data_format, dim);
  T* dx_ptr = dx_tensor->mut_dptr<T>();
  const int dx_elem_cnt = dx_tensor->shape_view().elem_cnt();
  const int dy_elem_cnt = dy_tensor->shape_view().elem_cnt();
  // dx must start from all zeros before atomic accumulation.
  RUN_CUDA_KERNEL((InitPtr<T>), ctx->stream(), dx_elem_cnt, dx_elem_cnt, dx_ptr);
  RUN_CUDA_KERNEL((AdaptiveAvgPoolGradCudaKernel<T>), ctx->stream(), dy_elem_cnt, dx_ptr,
                  dy_tensor->dptr<T>(), dy_elem_cnt, in_5d.At(2), in_5d.At(3), in_5d.At(4),
                  out_5d.At(2), out_5d.At(3), out_5d.At(4));
}
// Forward op kernel for adaptive_avg_pool1d; delegates to AvgForwardCompute<T> with dim == 1.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool1dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool1dKernel() = default;
  ~GpuAdaptiveAvgPool1dKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 1); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Forward op kernel for adaptive_avg_pool2d; delegates to AvgForwardCompute<T> with dim == 2.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool2dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool2dKernel() = default;
  ~GpuAdaptiveAvgPool2dKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 2); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Forward op kernel for adaptive_avg_pool3d; delegates to AvgForwardCompute<T> with dim == 3.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool3dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool3dKernel() = default;
  ~GpuAdaptiveAvgPool3dKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 3); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Backward op kernel for adaptive_avg_pool1d; delegates to AvgBackwardCompute<T> with dim == 1.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool1dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool1dGradKernel() = default;
  ~GpuAdaptiveAvgPool1dGradKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 1); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Backward op kernel for adaptive_avg_pool2d; delegates to AvgBackwardCompute<T> with dim == 2.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool2dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool2dGradKernel() = default;
  ~GpuAdaptiveAvgPool2dGradKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 2); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Backward op kernel for adaptive_avg_pool3d; delegates to AvgBackwardCompute<T> with dim == 3.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool3dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool3dGradKernel() = default;
  ~GpuAdaptiveAvgPool3dGradKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 3); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers the forward adaptive_avg_pool{1,2,3}d kernels for one
// (device, dtype) pair, matched on the output ("y") data type.
#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(device, dtype)                     \
  REGISTER_USER_KERNEL("adaptive_avg_pool1d")                                    \
      .SetCreateFn<GpuAdaptiveAvgPool1dKernel<device, dtype>>()                  \
      .SetIsMatchedHob((HobDeviceType() == device)                               \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value));   \
  REGISTER_USER_KERNEL("adaptive_avg_pool2d")                                    \
      .SetCreateFn<GpuAdaptiveAvgPool2dKernel<device, dtype>>()                  \
      .SetIsMatchedHob((HobDeviceType() == device)                               \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value));   \
  REGISTER_USER_KERNEL("adaptive_avg_pool3d")                                    \
      .SetCreateFn<GpuAdaptiveAvgPool3dKernel<device, dtype>>()                  \
      .SetIsMatchedHob((HobDeviceType() == device)                               \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value));
// Supported dtypes on CUDA/ROCm.
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, float);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, double);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, int);
// Registers the backward adaptive_avg_pool{1,2,3}d_grad kernels for one
// (device, dtype) pair, matched on the input-gradient ("dx") data type.
#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(device, dtype)            \
  REGISTER_USER_KERNEL("adaptive_avg_pool1d_grad")                               \
      .SetCreateFn<GpuAdaptiveAvgPool1dGradKernel<device, dtype>>()              \
      .SetIsMatchedHob((HobDeviceType() == device)                               \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value));  \
  REGISTER_USER_KERNEL("adaptive_avg_pool2d_grad")                               \
      .SetCreateFn<GpuAdaptiveAvgPool2dGradKernel<device, dtype>>()              \
      .SetIsMatchedHob((HobDeviceType() == device)                               \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value));  \
  REGISTER_USER_KERNEL("adaptive_avg_pool3d_grad")                               \
      .SetCreateFn<GpuAdaptiveAvgPool3dGradKernel<device, dtype>>()              \
      .SetIsMatchedHob((HobDeviceType() == device)                               \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value));
// Supported dtypes on CUDA/ROCm.
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, float);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, double);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, int);
} // namespace user_op
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/common/data_type.h"
#include "oneflow/core/kernel/util/cuda_half_util.h"
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/operator/operator_util.h"
#include "oneflow/user/utils/pool_util.h"
#include <algorithm>
#include <cfloat>
#include <cmath>
namespace oneflow {
namespace user_op {
// Adaptive pooling window boundaries: for output index `a` of `b` cells over
// an input extent of `c` cells, the window is [START_IND, END_IND). Using
// floor for the start and ceil for the end makes the windows exactly cover
// the input range.
#define START_IND(a, b, c) (int)std::floor((float)(a * c) / b)
#define END_IND(a, b, c) (int)std::ceil((float)((a + 1) * c) / b)
// Integer-only variants (not used by the kernels visible in this file).
#define START_IND_INT(a, b, c) ((a * c) / b)
#define END_IND_INT(a, b, c) (((a + 1) * c + b - 1) / b)
// Zero-fills `ptr[0, elements)` with a grid-stride loop so any launch
// configuration covers the whole range.
template<typename T>
__global__ void InitPtr(int elements, T* ptr) {
  const int stride = gridDim.x * blockDim.x;
  for (int gid = blockDim.x * blockIdx.x + threadIdx.x; gid < elements; gid += stride) {
    ptr[gid] = static_cast<T>(0);
  }
}
// Normalizes an N-d pooling shape to 5-D (N, C, D, H, W); the three spatial
// extents are resolved by GetInDim according to `data_format` and `dim`.
inline Shape GetShape5D(const Shape& shape, const std::string& data_format, int32_t dim) {
  const auto d = GetInDim(shape, data_format, 0, dim);
  const auto h = GetInDim(shape, data_format, 1, dim);
  const auto w = GetInDim(shape, data_format, 2, dim);
  return Shape({shape.At(0), shape.At(1), d, h, w});
}
// Forward kernel for N-d adaptive average pooling (shapes flattened to 5-D NCDHW).
// One thread per output element: averages the input values inside the
// adaptive window [START_IND, END_IND) along each spatial axis.
template<typename T>
__global__ void AdaptiveAvgPoolCudaKernel(const T* input, T* output, int num_elems, int in_d,
                                          int in_h, int in_w, int out_d, int out_h, int out_w) {
  const int out_panel_size = out_d * out_h * out_w;  // elements per (batch*channel) output slice
  const int in_panel_size = in_d * in_h * in_w;      // elements per (batch*channel) input slice
  CUDA_1D_KERNEL_LOOP(idx, num_elems) {
    // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper'
    // Decompose the flat output index into (batch*channel, d, h, w) coordinates.
    int bc_idx = idx / out_panel_size;
    int out_d_idx = (idx % out_panel_size) / out_w / out_h;
    int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w;
    int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w;
    // Adaptive window [start, end) per axis; k_* is the window extent.
    int in_start_d = START_IND(out_d_idx, out_d, in_d);
    int in_end_d = END_IND(out_d_idx, out_d, in_d);
    int k_d = in_end_d - in_start_d;
    int in_start_h = START_IND(out_h_idx, out_h, in_h);
    int in_end_h = END_IND(out_h_idx, out_h, in_h);
    int k_h = in_end_h - in_start_h;
    int in_start_w = START_IND(out_w_idx, out_w, in_w);
    int in_end_w = END_IND(out_w_idx, out_w, in_w);
    int k_w = in_end_w - in_start_w;
    // Pointer to the window origin within this (batch*channel) input slice.
    const T* in_ptr =
        input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w;
    T sum = static_cast<T>(0);
    for (int id = 0; id < k_d; ++id) {
      for (int ih = 0; ih < k_h; ++ih) {
        for (int iw = 0; iw < k_w; ++iw) {
          T val = *(in_ptr + ih * in_w + iw);
          sum += val;
        }
      }
      in_ptr += in_h * in_w;  // next input depth
    }
    // Update output
    output[idx] = sum / k_d / k_h / k_w;  // mean over the k_d * k_h * k_w window
  }
}
// Backward kernel for N-d adaptive average pooling (shapes flattened to 5-D NCDHW).
// One thread per output-gradient element (`output` == dy): spreads its value,
// divided by the pooling-window volume, over the matching input-gradient
// (`input` == dx) window. `input` must be zero-filled beforehand (see the
// InitPtr launch in AvgBackwardCompute); atomic adds are required because
// adaptive windows of neighboring dy elements may overlap in dx.
template<typename T>
__global__ void AdaptiveAvgPoolGradCudaKernel(T* input, const T* output, int num_elems, int in_d,
                                              int in_h, int in_w, int out_d, int out_h, int out_w) {
  const int out_panel_size = out_d * out_h * out_w;  // elements per (batch*channel) dy slice
  const int in_panel_size = in_d * in_h * in_w;      // elements per (batch*channel) dx slice
  CUDA_1D_KERNEL_LOOP(idx, num_elems) {
    // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper'
    // Decompose the flat dy index into (batch*channel, d, h, w) coordinates.
    int bc_idx = idx / out_panel_size;
    int out_d_idx = (idx % out_panel_size) / out_w / out_h;
    int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w;
    int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w;
    // Adaptive window [start, end) per axis; k_* is the window extent.
    int in_start_d = START_IND(out_d_idx, out_d, in_d);
    int in_end_d = END_IND(out_d_idx, out_d, in_d);
    int k_d = in_end_d - in_start_d;
    int in_start_h = START_IND(out_h_idx, out_h, in_h);
    int in_end_h = END_IND(out_h_idx, out_h, in_h);
    int k_h = in_end_h - in_start_h;
    int in_start_w = START_IND(out_w_idx, out_w, in_w);
    int in_end_w = END_IND(out_w_idx, out_w, in_w);
    int k_w = in_end_w - in_start_w;
    // Every dx cell in the window receives an equal share of this dy value.
    const T grad_delta = output[idx] / k_d / k_h / k_w;
    // Pointer to the window origin within this (batch*channel) dx slice.
    T* input_ptr =
        input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w;
    for (int id = 0; id < k_d; ++id) {
      for (int ih = 0; ih < k_h; ++ih) {
        for (int iw = 0; iw < k_w; ++iw) {
          // TODO (Tianyu): Use 'atmoic::Add' when necessary
          cuda::atomic::Add(input_ptr + ih * in_w + iw, grad_delta);
        }
      }
      input_ptr += in_h * in_w;  // next input depth
    }
  }
}
// Host-side launcher for adaptive average pooling forward.
// `dim` is the spatial dimensionality (1, 2 or 3); input/output shapes are
// normalized to 5-D NCDHW via GetShape5D before the kernel launch.
template<typename T>
void AvgForwardCompute(KernelComputeContext* ctx, const int32_t& dim) {
  const Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
  Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
  const Shape& x_shape = ctx->TensorDesc4ArgNameAndIndex("x", 0)->shape();
  const Shape& y_shape = ctx->TensorDesc4ArgNameAndIndex("y", 0)->shape();
  // TODO (Tianyu): Support 'channels_last'
  const std::string data_format = "channels_first";
  const Shape& in_5d = GetShape5D(x_shape, data_format, dim);
  const Shape& out_5d = GetShape5D(y_shape, data_format, dim);
  const int y_elem_cnt = y_tensor->shape_view().elem_cnt();
  // One GPU thread per output element.
  RUN_CUDA_KERNEL((AdaptiveAvgPoolCudaKernel<T>), ctx->stream(), y_elem_cnt, x_tensor->dptr<T>(),
                  y_tensor->mut_dptr<T>(), y_elem_cnt, in_5d.At(2), in_5d.At(3), in_5d.At(4),
                  out_5d.At(2), out_5d.At(3), out_5d.At(4));
}
// Host-side launcher for adaptive average pooling backward.
// Zero-fills dx first (the grad kernel accumulates with atomic adds), then
// scatters dy into dx with one GPU thread per dy element.
template<typename T>
void AvgBackwardCompute(KernelComputeContext* ctx, const int32_t& dim) {
  const Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
  Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
  const Shape& dx_shape = ctx->TensorDesc4ArgNameAndIndex("dx", 0)->shape();
  const Shape& dy_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape();
  // TODO (Tianyu): Support 'channels_last'
  const std::string data_format = "channels_first";
  const Shape& in_5d = GetShape5D(dx_shape, data_format, dim);
  const Shape& out_5d = GetShape5D(dy_shape, data_format, dim);
  T* dx_ptr = dx_tensor->mut_dptr<T>();
  const int dx_elem_cnt = dx_tensor->shape_view().elem_cnt();
  const int dy_elem_cnt = dy_tensor->shape_view().elem_cnt();
  // dx must start from all zeros before atomic accumulation.
  RUN_CUDA_KERNEL((InitPtr<T>), ctx->stream(), dx_elem_cnt, dx_elem_cnt, dx_ptr);
  RUN_CUDA_KERNEL((AdaptiveAvgPoolGradCudaKernel<T>), ctx->stream(), dy_elem_cnt, dx_ptr,
                  dy_tensor->dptr<T>(), dy_elem_cnt, in_5d.At(2), in_5d.At(3), in_5d.At(4),
                  out_5d.At(2), out_5d.At(3), out_5d.At(4));
}
// Forward op kernel for adaptive_avg_pool1d; delegates to AvgForwardCompute<T> with dim == 1.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool1dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool1dKernel() = default;
  ~GpuAdaptiveAvgPool1dKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 1); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Forward op kernel for adaptive_avg_pool2d; delegates to AvgForwardCompute<T> with dim == 2.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool2dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool2dKernel() = default;
  ~GpuAdaptiveAvgPool2dKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 2); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Forward op kernel for adaptive_avg_pool3d; delegates to AvgForwardCompute<T> with dim == 3.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool3dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool3dKernel() = default;
  ~GpuAdaptiveAvgPool3dKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 3); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Backward op kernel for adaptive_avg_pool1d; delegates to AvgBackwardCompute<T> with dim == 1.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool1dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool1dGradKernel() = default;
  ~GpuAdaptiveAvgPool1dGradKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 1); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Backward op kernel for adaptive_avg_pool2d; delegates to AvgBackwardCompute<T> with dim == 2.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool2dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool2dGradKernel() = default;
  ~GpuAdaptiveAvgPool2dGradKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 2); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Backward op kernel for adaptive_avg_pool3d; delegates to AvgBackwardCompute<T> with dim == 3.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool3dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool3dGradKernel() = default;
  ~GpuAdaptiveAvgPool3dGradKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 3); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers the forward adaptive_avg_pool{1,2,3}d kernels for one
// (device, dtype) pair, matched on the output ("y") data type.
#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(device, dtype)                     \
  REGISTER_USER_KERNEL("adaptive_avg_pool1d")                                    \
      .SetCreateFn<GpuAdaptiveAvgPool1dKernel<device, dtype>>()                  \
      .SetIsMatchedHob((HobDeviceType() == device)                               \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value));   \
  REGISTER_USER_KERNEL("adaptive_avg_pool2d")                                    \
      .SetCreateFn<GpuAdaptiveAvgPool2dKernel<device, dtype>>()                  \
      .SetIsMatchedHob((HobDeviceType() == device)                               \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value));   \
  REGISTER_USER_KERNEL("adaptive_avg_pool3d")                                    \
      .SetCreateFn<GpuAdaptiveAvgPool3dKernel<device, dtype>>()                  \
      .SetIsMatchedHob((HobDeviceType() == device)                               \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value));
// Supported dtypes on CUDA/ROCm.
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, float);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, double);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, int);
// Registers the backward adaptive_avg_pool{1,2,3}d_grad kernels for one
// (device, dtype) pair, matched on the input-gradient ("dx") data type.
#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(device, dtype)            \
  REGISTER_USER_KERNEL("adaptive_avg_pool1d_grad")                               \
      .SetCreateFn<GpuAdaptiveAvgPool1dGradKernel<device, dtype>>()              \
      .SetIsMatchedHob((HobDeviceType() == device)                               \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value));  \
  REGISTER_USER_KERNEL("adaptive_avg_pool2d_grad")                               \
      .SetCreateFn<GpuAdaptiveAvgPool2dGradKernel<device, dtype>>()              \
      .SetIsMatchedHob((HobDeviceType() == device)                               \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value));  \
  REGISTER_USER_KERNEL("adaptive_avg_pool3d_grad")                               \
      .SetCreateFn<GpuAdaptiveAvgPool3dGradKernel<device, dtype>>()              \
      .SetIsMatchedHob((HobDeviceType() == device)                               \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value));
// Supported dtypes on CUDA/ROCm.
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, float);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, double);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, int);
} // namespace user_op
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/device/cuda_util.h"
#include "affine_grid_kernel.h"
namespace oneflow {
namespace {
// Maps grid index `index` in [0, num_steps) to a normalized coordinate in [-1, 1].
// align_corners == true: endpoints land exactly on -1 and +1.
// align_corners == false: the aligned value is rescaled by (num_steps - 1) / num_steps,
// which insets the samples by half a step from the ±1 endpoints.
// Degenerate grids (num_steps <= 1) map to 0, which also guards the division
// by (num_steps - 1) below.
template<typename data_type, bool align_corners>
OF_DEVICE_FUNC data_type LinspaceGPU(int32_t index, int32_t num_steps) {
  if (num_steps <= 1) { return static_cast<data_type>(0.0); }
  if (align_corners) {
    return static_cast<data_type>(-1.0 + 2.0 / (num_steps - 1) * index);
  } else {
    return static_cast<data_type>((-1.0 + 2.0 / (num_steps - 1) * index) * (num_steps - 1)
                                  / num_steps);
  }
}
// Writes a (H, W, 3) base grid of homogeneous 2-D coordinates; one thread per
// pixel (nthreads == H * W). Each pixel stores (w_value, h_value, 1), with
// coordinates normalized to [-1, 1] by LinspaceGPU.
template<typename data_type, bool align_corners>
__global__ void Generate2DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t H,
                                            int32_t W) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int32_t h = index / W;
    const int32_t w = index % W;
    const int32_t pixel_length = 3;  // (x, y, 1) homogeneous coordinate
    data_type* row_ptr = grid_ptr + h * W * pixel_length;
    data_type* pixel_ptr = row_ptr + w * pixel_length;
    data_type h_value = LinspaceGPU<data_type, align_corners>(h, H);
    data_type w_value = LinspaceGPU<data_type, align_corners>(w, W);
    pixel_ptr[0] = w_value;
    pixel_ptr[1] = h_value;
    pixel_ptr[2] = static_cast<data_type>(1.0);
  }
}
// Writes a (D, H, W, 4) base grid of homogeneous 3-D coordinates; one thread
// per (d, h) row (nthreads == D * H), each thread filling all W pixels of its
// row with (w_value, h_value, d_value, 1).
template<typename data_type, bool align_corners>
__global__ void Generate3DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t D,
                                            int32_t H, int32_t W) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int32_t d = index / H;
    const int32_t h = index % H;
    const int32_t pixel_length = 4;  // (x, y, z, 1) homogeneous coordinate
    data_type* image_ptr = grid_ptr + d * H * W * pixel_length;
    data_type* row_ptr = image_ptr + h * W * pixel_length;
    data_type d_value = LinspaceGPU<data_type, align_corners>(d, D);
    data_type h_value = LinspaceGPU<data_type, align_corners>(h, H);
    for (int32_t w = 0; w < W; ++w) {
      data_type* pixel_ptr = row_ptr + w * pixel_length;
      data_type w_value = LinspaceGPU<data_type, align_corners>(w, W);
      pixel_ptr[0] = w_value;
      pixel_ptr[1] = h_value;
      pixel_ptr[2] = d_value;
      pixel_ptr[3] = static_cast<data_type>(1.0);
    }
  }
}
} // namespace
// Launches the 2-D base-grid kernel (float); one thread per (h, w) pixel.
// align_corners selects the kernel template instantiation at compile time.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate2D(user_op::KernelComputeContext* ctx,
                                                        float* grid_ptr, int64_t H, int64_t W,
                                                        bool align_corners) {
  const int elem_cnt = H * W;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<float, true>), ctx->stream(), elem_cnt, elem_cnt,
                    grid_ptr, H, W);
    return;
  }
  RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<float, false>), ctx->stream(), elem_cnt, elem_cnt,
                  grid_ptr, H, W);
}
// Launches the 2-D base-grid kernel (double); one thread per (h, w) pixel.
// align_corners selects the kernel template instantiation at compile time.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate2D(user_op::KernelComputeContext* ctx,
                                                        double* grid_ptr, int64_t H, int64_t W,
                                                        bool align_corners) {
  const int elem_cnt = H * W;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<double, true>), ctx->stream(), elem_cnt, elem_cnt,
                    grid_ptr, H, W);
    return;
  }
  RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<double, false>), ctx->stream(), elem_cnt, elem_cnt,
                  grid_ptr, H, W);
}
// Launches the 3-D base-grid kernel (float); one thread per (d, h) row, each
// writing all W pixels of its row. align_corners selects the template
// instantiation at compile time.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate3D(user_op::KernelComputeContext* ctx,
                                                        float* grid_ptr, int64_t D, int64_t H,
                                                        int64_t W, bool align_corners) {
  const int row_cnt = D * H;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<float, true>), ctx->stream(), row_cnt, row_cnt,
                    grid_ptr, D, H, W);
    return;
  }
  RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<float, false>), ctx->stream(), row_cnt, row_cnt,
                  grid_ptr, D, H, W);
}
// Launches the 3-D base-grid kernel (double); one thread per (d, h) row, each
// writing all W pixels of its row. align_corners selects the template
// instantiation at compile time.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate3D(user_op::KernelComputeContext* ctx,
                                                        double* grid_ptr, int64_t D, int64_t H,
                                                        int64_t W, bool align_corners) {
  const int row_cnt = D * H;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<double, true>), ctx->stream(), row_cnt, row_cnt,
                    grid_ptr, D, H, W);
    return;
  }
  RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<double, false>), ctx->stream(), row_cnt, row_cnt,
                  grid_ptr, D, H, W);
}
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/device/cuda_util.h"
#include "affine_grid_kernel.h"
namespace oneflow {
namespace {
// Maps grid index `index` in [0, num_steps) to a normalized coordinate in [-1, 1].
// align_corners == true: endpoints land exactly on -1 and +1.
// align_corners == false: the aligned value is rescaled by (num_steps - 1) / num_steps,
// which insets the samples by half a step from the ±1 endpoints.
// Degenerate grids (num_steps <= 1) map to 0, which also guards the division
// by (num_steps - 1) below.
template<typename data_type, bool align_corners>
OF_DEVICE_FUNC data_type LinspaceGPU(int32_t index, int32_t num_steps) {
  if (num_steps <= 1) { return static_cast<data_type>(0.0); }
  if (align_corners) {
    return static_cast<data_type>(-1.0 + 2.0 / (num_steps - 1) * index);
  } else {
    return static_cast<data_type>((-1.0 + 2.0 / (num_steps - 1) * index) * (num_steps - 1)
                                  / num_steps);
  }
}
// Writes a (H, W, 3) base grid of homogeneous 2-D coordinates; one thread per
// pixel (nthreads == H * W). Each pixel stores (w_value, h_value, 1), with
// coordinates normalized to [-1, 1] by LinspaceGPU.
template<typename data_type, bool align_corners>
__global__ void Generate2DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t H,
                                            int32_t W) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int32_t h = index / W;
    const int32_t w = index % W;
    const int32_t pixel_length = 3;  // (x, y, 1) homogeneous coordinate
    data_type* row_ptr = grid_ptr + h * W * pixel_length;
    data_type* pixel_ptr = row_ptr + w * pixel_length;
    data_type h_value = LinspaceGPU<data_type, align_corners>(h, H);
    data_type w_value = LinspaceGPU<data_type, align_corners>(w, W);
    pixel_ptr[0] = w_value;
    pixel_ptr[1] = h_value;
    pixel_ptr[2] = static_cast<data_type>(1.0);
  }
}
// Writes a (D, H, W, 4) base grid of homogeneous 3-D coordinates; one thread
// per (d, h) row (nthreads == D * H), each thread filling all W pixels of its
// row with (w_value, h_value, d_value, 1).
template<typename data_type, bool align_corners>
__global__ void Generate3DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t D,
                                            int32_t H, int32_t W) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int32_t d = index / H;
    const int32_t h = index % H;
    const int32_t pixel_length = 4;  // (x, y, z, 1) homogeneous coordinate
    data_type* image_ptr = grid_ptr + d * H * W * pixel_length;
    data_type* row_ptr = image_ptr + h * W * pixel_length;
    data_type d_value = LinspaceGPU<data_type, align_corners>(d, D);
    data_type h_value = LinspaceGPU<data_type, align_corners>(h, H);
    for (int32_t w = 0; w < W; ++w) {
      data_type* pixel_ptr = row_ptr + w * pixel_length;
      data_type w_value = LinspaceGPU<data_type, align_corners>(w, W);
      pixel_ptr[0] = w_value;
      pixel_ptr[1] = h_value;
      pixel_ptr[2] = d_value;
      pixel_ptr[3] = static_cast<data_type>(1.0);
    }
  }
}
} // namespace
// Launches the 2-D base-grid kernel (float); one thread per (h, w) pixel.
// align_corners selects the kernel template instantiation at compile time.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate2D(user_op::KernelComputeContext* ctx,
                                                        float* grid_ptr, int64_t H, int64_t W,
                                                        bool align_corners) {
  const int elem_cnt = H * W;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<float, true>), ctx->stream(), elem_cnt, elem_cnt,
                    grid_ptr, H, W);
    return;
  }
  RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<float, false>), ctx->stream(), elem_cnt, elem_cnt,
                  grid_ptr, H, W);
}
// Launches the 2-D base-grid kernel (double); one thread per (h, w) pixel.
// align_corners selects the kernel template instantiation at compile time.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate2D(user_op::KernelComputeContext* ctx,
                                                        double* grid_ptr, int64_t H, int64_t W,
                                                        bool align_corners) {
  const int elem_cnt = H * W;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<double, true>), ctx->stream(), elem_cnt, elem_cnt,
                    grid_ptr, H, W);
    return;
  }
  RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<double, false>), ctx->stream(), elem_cnt, elem_cnt,
                  grid_ptr, H, W);
}
// Launches the 3-D base-grid kernel (float); one thread per (d, h) row, each
// writing all W pixels of its row. align_corners selects the template
// instantiation at compile time.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate3D(user_op::KernelComputeContext* ctx,
                                                        float* grid_ptr, int64_t D, int64_t H,
                                                        int64_t W, bool align_corners) {
  const int row_cnt = D * H;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<float, true>), ctx->stream(), row_cnt, row_cnt,
                    grid_ptr, D, H, W);
    return;
  }
  RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<float, false>), ctx->stream(), row_cnt, row_cnt,
                  grid_ptr, D, H, W);
}
// Launches the 3-D base-grid kernel (double); one thread per (d, h) row, each
// writing all W pixels of its row. align_corners selects the template
// instantiation at compile time.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate3D(user_op::KernelComputeContext* ctx,
                                                        double* grid_ptr, int64_t D, int64_t H,
                                                        int64_t W, bool align_corners) {
  const int row_cnt = D * H;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<double, true>), ctx->stream(), row_cnt, row_cnt,
                    grid_ptr, D, H, W);
    return;
  }
  RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<double, false>), ctx->stream(), row_cnt, row_cnt,
                  grid_ptr, D, H, W);
}
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_ROCM
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/user/kernels/arange_kernel_util.h"
namespace oneflow {
namespace user_op {
// GPU entry point for arange: delegates to DoArange, which fills the
// arange_elem_cnt slots of `out` from (start, delta) — see
// oneflow/user/kernels/arange_kernel_util.h for the fill rule.
template<typename T>
__global__ void ArangeForwardGpuKernel(const T start, const T delta, const int64_t arange_elem_cnt,
                                       T* out) {
  // Use Loop to set the value
  DoArange<T>(start, delta, arange_elem_cnt, out);
}
// CUDA/ROCm specialization of ArangeFunctor: launches ArangeForwardGpuKernel
// with one thread per output element.
template<typename T>
struct ArangeFunctor<DeviceType::kCUDA, T> final {
  void operator()(ep::Stream* stream, const T start, const T delta, const int64_t arange_elem_cnt,
                  T* out) {
    // The thread num is set as arange_elem_cnt
    RUN_CUDA_KERNEL((ArangeForwardGpuKernel<T>), stream, arange_elem_cnt, start, delta,
                    arange_elem_cnt, out);
  }
};
// Instantiate the functor for every dtype in ARANGE_DATA_TYPE_SEQ.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ARANGE_FUNCTOR, (DeviceType::kCUDA),
                                 ARANGE_DATA_TYPE_SEQ);
} // namespace user_op
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_ROCM
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/user/kernels/arange_kernel_util.h"
namespace oneflow {
namespace user_op {
// GPU entry point for arange: delegates to DoArange, which fills the
// arange_elem_cnt slots of `out` from (start, delta) — see
// oneflow/user/kernels/arange_kernel_util.h for the fill rule.
template<typename T>
__global__ void ArangeForwardGpuKernel(const T start, const T delta, const int64_t arange_elem_cnt,
                                       T* out) {
  // Use Loop to set the value
  DoArange<T>(start, delta, arange_elem_cnt, out);
}
// CUDA/ROCm specialization of ArangeFunctor: launches ArangeForwardGpuKernel
// with one thread per output element.
template<typename T>
struct ArangeFunctor<DeviceType::kCUDA, T> final {
  void operator()(ep::Stream* stream, const T start, const T delta, const int64_t arange_elem_cnt,
                  T* out) {
    // The thread num is set as arange_elem_cnt
    RUN_CUDA_KERNEL((ArangeForwardGpuKernel<T>), stream, arange_elem_cnt, start, delta,
                    arange_elem_cnt, out);
  }
};
// Instantiate the functor for every dtype in ARANGE_DATA_TYPE_SEQ.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ARANGE_FUNCTOR, (DeviceType::kCUDA),
                                 ARANGE_DATA_TYPE_SEQ);
} // namespace user_op
} // namespace oneflow
#endif // End WITH_ROCM
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/user/kernels/radix_sort.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
// Partitions one pre-allocated temp buffer of `capacity` bytes into three
// consecutive, CUDA-aligned regions:
//   [ sorted values (T) | index payload (int32_t) | sort temp storage ]
// The first two are sized from the input element count; whatever remains is
// handed to the sort's temp-storage API. CHECK_GE fails if the buffer was
// sized too small for the first two regions.
template<typename T>
class TmpBufferManager final {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager);
  TmpBufferManager(int32_t capacity, void* ptr, const ShapeView& in_shape)
      : capacity_{capacity},
        sorted_in_elem_cnt_{in_shape.elem_cnt()},
        indices_elem_cnt_{sorted_in_elem_cnt_} {
    const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T));
    const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t));
    sorted_in_ptr_ = reinterpret_cast<T*>(ptr);
    indices_ptr_ = reinterpret_cast<int32_t*>(reinterpret_cast<char*>(sorted_in_ptr_)
                                              + sorted_in_aligned_bytes);
    temp_storage_ptr_ =
        reinterpret_cast<void*>(reinterpret_cast<char*>(indices_ptr_) + indices_aligned_bytes);
    // Remaining bytes go to the sort's temp storage; must not be negative.
    temp_storage_bytes_ = capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes;
    CHECK_GE(temp_storage_bytes_, 0);
  }
  ~TmpBufferManager() = default;
  T* SortedInPtr() const { return sorted_in_ptr_; }
  int32_t* IndicesPtr() const { return indices_ptr_; }
  void* TempStoragePtr() const { return temp_storage_ptr_; }
  int32_t TempStorageBytes() const { return temp_storage_bytes_; }
 private:
  int32_t capacity_;  // total bytes of the backing buffer
  T* sorted_in_ptr_;
  int32_t* indices_ptr_;
  void* temp_storage_ptr_;
  int64_t sorted_in_elem_cnt_;
  int64_t indices_elem_cnt_;
  int32_t temp_storage_bytes_;
};
// Fills the index payload with the per-row identity permutation:
// indices_ptr[i] = i % instance_size for every flattened element i.
__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr, int32_t instance_size) {
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }
}
} // namespace
// arg_sort user kernel: for each innermost row of "in", writes the int32
// index permutation that sorts it into "out". The "tmp_buffer" tensor is
// carved into sorted-values / indices / sort temp storage by
// TmpBufferManager; indices start as the per-row identity permutation
// (InitializeIndices) and travel through the key-value sort as the payload.
// `direction` must be "ASCENDING" or "DESCENDING".
template<typename T>
class GpuArgSortKernel final : public user_op::OpKernel {
 public:
  GpuArgSortKernel() = default;
  ~GpuArgSortKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    TmpBufferManager<T> buf_manager(static_cast<int32_t>(tmp_buffer->shape_view().elem_cnt()),
                                    tmp_buffer->mut_dptr<void>(), in->shape_view());
    // One sort instance per innermost row.
    const int32_t elem_cnt = in->shape_view().elem_cnt();
    const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1);
    const int32_t instance_num = elem_cnt / instance_size;
    const std::string& direction = ctx->Attr<std::string>("direction");
    // Seed the payload with per-row identity indices before sorting.
    InitializeIndices<<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                        ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, buf_manager.IndicesPtr(), instance_size);
    if (direction == "ASCENDING") {
      SortPairsAscending(in->dptr<T>(), buf_manager.IndicesPtr(), instance_num, instance_size,
                         buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(),
                         buf_manager.SortedInPtr(), out->mut_dptr<int32_t>(),
                         ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    } else if (direction == "DESCENDING") {
      SortPairsDescending(in->dptr<T>(), buf_manager.IndicesPtr(), instance_num, instance_size,
                          buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(),
                          buf_manager.SortedInPtr(), out->mut_dptr<int32_t>(),
                          ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    } else {
      UNIMPLEMENTED();
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers the "arg_sort" user kernel for CUDA/ROCm devices and infers its
// tmp_buffer size. The inferred layout mirrors TmpBufferManager:
//   [ sorted values | indices | segmented radix-sort temp storage ].
#define REGISTER_CUDA_ARG_SORT_KERNEL(dtype)                                                      \
  REGISTER_USER_KERNEL("arg_sort")                                                                \
      .SetCreateFn<GpuArgSortKernel<dtype>>()                                                     \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                            \
                       && (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))           \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                         \
        const Shape& in_shape = ctx->InputShape("in", 0);                                         \
        const int32_t elem_cnt = in_shape.elem_cnt();                                             \
        const int32_t instance_size = in_shape.dim_vec().back();                                  \
        const int32_t instance_num = elem_cnt / instance_size;                                    \
                                                                                                  \
        /* Sorted In */                                                                           \
        const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype));     \
        /* Indices */                                                                             \
        const int32_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int32_t));     \
        /* CUB Temp Storage */                                                                    \
        int32_t temp_storage_bytes = -1;                                                          \
        const std::string& direction = ctx->Attr<std::string>("direction");                       \
        if (direction == "ASCENDING") {                                                           \
          temp_storage_bytes =                                                                    \
              InferTempStorageForSortPairsAscending<dtype, int32_t>(instance_num, instance_size); \
        } else if (direction == "DESCENDING") {                                                   \
          temp_storage_bytes =                                                                    \
              InferTempStorageForSortPairsDescending<dtype, int32_t>(instance_num, instance_size); \
        } else {                                                                                  \
          UNIMPLEMENTED();                                                                        \
        }                                                                                         \
                                                                                                  \
        return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes;              \
      });

// Element types supported by the GPU arg_sort kernel.
REGISTER_CUDA_ARG_SORT_KERNEL(float)
REGISTER_CUDA_ARG_SORT_KERNEL(double)
REGISTER_CUDA_ARG_SORT_KERNEL(bool)
REGISTER_CUDA_ARG_SORT_KERNEL(int8_t)
REGISTER_CUDA_ARG_SORT_KERNEL(uint8_t)
REGISTER_CUDA_ARG_SORT_KERNEL(int32_t)
REGISTER_CUDA_ARG_SORT_KERNEL(int64_t)
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/user/kernels/radix_sort.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
// Carves one pre-allocated workspace into the three sub-buffers used by the
// arg_sort kernel:
//   [ sorted input values | per-row indices | sort temp storage ]
// "capacity" is the total workspace size in bytes; whatever remains after the
// first two aligned sub-buffers is handed to the sort as temp storage.
template<typename T>
class TmpBufferManager final {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager);
  TmpBufferManager(int32_t capacity, void* ptr, const ShapeView& in_shape)
      : capacity_{capacity},
        sorted_in_elem_cnt_{in_shape.elem_cnt()},
        indices_elem_cnt_{sorted_in_elem_cnt_} {
    const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T));
    const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t));
    sorted_in_ptr_ = reinterpret_cast<T*>(ptr);
    indices_ptr_ = reinterpret_cast<int32_t*>(reinterpret_cast<char*>(sorted_in_ptr_)
                                              + sorted_in_aligned_bytes);
    temp_storage_ptr_ =
        reinterpret_cast<void*>(reinterpret_cast<char*>(indices_ptr_) + indices_aligned_bytes);
    // Leftover bytes belong to the sort's temp storage; must not be negative.
    temp_storage_bytes_ = capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes;
    CHECK_GE(temp_storage_bytes_, 0);
  }
  ~TmpBufferManager() = default;
  T* SortedInPtr() const { return sorted_in_ptr_; }
  int32_t* IndicesPtr() const { return indices_ptr_; }
  void* TempStoragePtr() const { return temp_storage_ptr_; }
  int32_t TempStorageBytes() const { return temp_storage_bytes_; }

 private:
  int32_t capacity_;            // total workspace bytes
  T* sorted_in_ptr_;            // sorted values sub-buffer (base of workspace)
  int32_t* indices_ptr_;        // per-row index sub-buffer
  void* temp_storage_ptr_;      // temp storage handed to the sort
  int64_t sorted_in_elem_cnt_;  // element count of the input tensor
  int64_t indices_elem_cnt_;    // same as sorted_in_elem_cnt_
  int32_t temp_storage_bytes_;
};
// Fills indices_ptr so that each entry holds its column index within its row
// (instance): indices_ptr[i] = i % instance_size. Seeds the key buffer for the
// segmented sort that follows.
__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr, int32_t instance_size) {
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) {
    const int32_t col_in_instance = i % instance_size;
    indices_ptr[i] = col_in_instance;
  }
}
} // namespace
// GPU arg_sort: for every innermost row of "in", writes to "out" the element
// indices ordered by value (ascending or descending per the "direction" attr).
// tmp_buffer is carved by TmpBufferManager into
// [ sorted values | indices | radix-sort temp storage ].
template<typename T>
class GpuArgSortKernel final : public user_op::OpKernel {
 public:
  GpuArgSortKernel() = default;
  ~GpuArgSortKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const ShapeView& in_shape = in->shape_view();
    const int32_t elem_cnt = in_shape.elem_cnt();
    const int32_t instance_size = in_shape.At(in_shape.NumAxes() - 1);
    const int32_t instance_num = elem_cnt / instance_size;
    TmpBufferManager<T> buf_manager(static_cast<int32_t>(tmp_buffer->shape_view().elem_cnt()),
                                    tmp_buffer->mut_dptr<void>(), in_shape);
    hipStream_t cuda_stream = ctx->stream()->As<ep::CudaStream>()->cuda_stream();
    // Seed per-row column indices before sorting pairs (value, index).
    InitializeIndices<<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0, cuda_stream>>>(
        elem_cnt, buf_manager.IndicesPtr(), instance_size);
    const std::string& direction = ctx->Attr<std::string>("direction");
    if (direction == "ASCENDING") {
      SortPairsAscending(in->dptr<T>(), buf_manager.IndicesPtr(), instance_num, instance_size,
                         buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(),
                         buf_manager.SortedInPtr(), out->mut_dptr<int32_t>(), cuda_stream);
    } else if (direction == "DESCENDING") {
      SortPairsDescending(in->dptr<T>(), buf_manager.IndicesPtr(), instance_num, instance_size,
                          buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(),
                          buf_manager.SortedInPtr(), out->mut_dptr<int32_t>(), cuda_stream);
    } else {
      UNIMPLEMENTED();
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers the "arg_sort" user kernel for CUDA/ROCm devices and infers its
// tmp_buffer size. The inferred layout mirrors TmpBufferManager:
//   [ sorted values | indices | segmented radix-sort temp storage ].
#define REGISTER_CUDA_ARG_SORT_KERNEL(dtype)                                                      \
  REGISTER_USER_KERNEL("arg_sort")                                                                \
      .SetCreateFn<GpuArgSortKernel<dtype>>()                                                     \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                            \
                       && (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))           \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                         \
        const Shape& in_shape = ctx->InputShape("in", 0);                                         \
        const int32_t elem_cnt = in_shape.elem_cnt();                                             \
        const int32_t instance_size = in_shape.dim_vec().back();                                  \
        const int32_t instance_num = elem_cnt / instance_size;                                    \
                                                                                                  \
        /* Sorted In */                                                                           \
        const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype));     \
        /* Indices */                                                                             \
        const int32_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int32_t));     \
        /* CUB Temp Storage */                                                                    \
        int32_t temp_storage_bytes = -1;                                                          \
        const std::string& direction = ctx->Attr<std::string>("direction");                       \
        if (direction == "ASCENDING") {                                                           \
          temp_storage_bytes =                                                                    \
              InferTempStorageForSortPairsAscending<dtype, int32_t>(instance_num, instance_size); \
        } else if (direction == "DESCENDING") {                                                   \
          temp_storage_bytes =                                                                    \
              InferTempStorageForSortPairsDescending<dtype, int32_t>(instance_num, instance_size); \
        } else {                                                                                  \
          UNIMPLEMENTED();                                                                        \
        }                                                                                         \
                                                                                                  \
        return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes;              \
      });

// Element types supported by the GPU arg_sort kernel.
REGISTER_CUDA_ARG_SORT_KERNEL(float)
REGISTER_CUDA_ARG_SORT_KERNEL(double)
REGISTER_CUDA_ARG_SORT_KERNEL(bool)
REGISTER_CUDA_ARG_SORT_KERNEL(int8_t)
REGISTER_CUDA_ARG_SORT_KERNEL(uint8_t)
REGISTER_CUDA_ARG_SORT_KERNEL(int32_t)
REGISTER_CUDA_ARG_SORT_KERNEL(int64_t)
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/user/kernels/arg_where_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/core/common/small_vector.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hipcub/hipcub.hpp>
namespace oneflow {
namespace {
constexpr int kBlockSize = cuda::elementwise::kBlockSize;

// Returns the launch grid size for elem_cnt elements, as computed by the
// elementwise helper; aborts via OF_CUDA_CHECK on failure.
int GetNumBlocks(int64_t elem_cnt) {
  int block_count = 0;
  OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &block_count));
  return block_count;
}
// Minimal output-iterator shim handed to hipcub::DeviceSelect::Flagged.
// Logical element i lives at ptr_[i * NDIM]: the selection writes each hit's
// flat offset into the first slot of its NDIM-wide output row, which is later
// expanded in place into a full ND index (see CudaOffsetToNdIndexInplace).
template<typename T, int NDIM>
struct StrideIterator {
  typedef StrideIterator self_type;
  typedef std::ptrdiff_t difference_type;
  typedef T value_type;
  typedef T* pointer;
  typedef T& reference;
  // Advertised for iterator traits; only operator[] is actually implemented.
  typedef std::random_access_iterator_tag iterator_category;

  explicit StrideIterator(T* ptr, size_t max_iters) : ptr_(ptr), max_iters_(max_iters) {}

  OF_DEVICE_FUNC reference operator[](int i) {
    assert(0 <= i && i < max_iters_);
    return *(ptr_ + (i * NDIM));
  }

 private:
  T* ptr_;
  size_t max_iters_;  // addressable element count, only used by the assert
};
// Expands each selected flat offset into its full ND index, in place.
// output_ptr holds *output_size_ptr rows of NDIM entries; row i carries the
// flat offset in its first slot, which OffsetToNdIndex overwrites with the
// ND index. The row count is read from device memory (output_size_ptr), so
// this can be enqueued right after DeviceSelect without a host sync.
template<typename T, int NDIM>
__global__ void __launch_bounds__(kBlockSize)
    CudaOffsetToNdIndexInplace(NdIndexOffsetHelper<T, NDIM> index_converter,
                               const T* output_size_ptr, T* output_ptr) {
  CUDA_1D_KERNEL_LOOP_T(T, i, *output_size_ptr) {
    T* index_ptr = output_ptr + i * NDIM;
    index_converter.OffsetToNdIndex(*index_ptr, index_ptr);
  }
}
// Unary predicate mapping a value to its truthiness; flags nonzero input
// elements for DeviceSelect.
template<typename T>
struct IsTrue {
  __device__ __forceinline__ bool operator()(const T& value) const {
    return static_cast<bool>(value);
  }
};
template<typename IN_T, typename OUT_T, typename OUT_ITER>
hipError_t SelectTrue(hipStream_t stream, int num_items, void* temp_storage,
size_t& temp_storage_bytes, const IN_T* input, OUT_ITER output_iter,
OUT_T* num_selected) {
IsTrue<IN_T> is_true;
hipcub::TransformInputIterator<bool, IsTrue<IN_T>, const IN_T*> flag_iter(input, is_true);
hipcub::CountingInputIterator<OUT_T> offset_counter(0);
return hipcub::DeviceSelect::Flagged(temp_storage, temp_storage_bytes, offset_counter, flag_iter,
output_iter, num_selected, num_items, stream, false);
}
} // namespace
// CUDA/ROCm arg_where: writes the indices of all nonzero input elements to
// output_ptr and the number of hits to output_size_ptr (a device-side scalar).
template<typename IN_T, typename OUT_T, int NDIM>
struct ArgWhereKernelUtil<DeviceType::kCUDA, IN_T, OUT_T, NDIM> {
  static void ArgWhere(ep::Stream* stream, const ShapeView& input_shape, const IN_T* input_ptr,
                       void* temp_storage, size_t temp_storage_bytes, OUT_T* output_ptr,
                       OUT_T* output_size_ptr) {
    const int64_t elem_cnt = input_shape.elem_cnt();
    // deal with empty blob: report zero hits and skip all device work
    if (elem_cnt == 0) {
      Memset<DeviceType::kCUDA>(stream, output_size_ptr, 0, sizeof(OUT_T));
      return;
    }
    CHECK_NOTNULL(stream);
    // Offsets are stored as OUT_T, so the element count must fit in it.
    CHECK_LE(elem_cnt, std::numeric_limits<OUT_T>::max());
    size_t workspace = GetWorkspaceBytesSize(stream, elem_cnt);
    CHECK_LE(workspace, temp_storage_bytes);
    if (NDIM == 1) {
      // 1-D: the selected flat offsets already are the final indices.
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OUT_T*>(
          stream->As<ep::CudaStream>()->cuda_stream(), input_shape.elem_cnt(), temp_storage,
          workspace, input_ptr, output_ptr, output_size_ptr)));
    } else {
      // N-D: select flat offsets into the first slot of each NDIM-wide output
      // row, then expand every row to a full ND index in place.
      using OutputIterator = StrideIterator<OUT_T, NDIM>;
      OutputIterator output_iter(output_ptr, elem_cnt);
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OutputIterator>(
          stream->As<ep::CudaStream>()->cuda_stream(), elem_cnt, temp_storage, workspace, input_ptr,
          output_iter, output_size_ptr)));
      OUT_T dims[NDIM] = {0};
      std::transform(input_shape.ptr(), input_shape.ptr() + input_shape.NumAxes(), dims,
                     [](int64_t dim) { return static_cast<OUT_T>(dim); });
      NdIndexOffsetHelper<OUT_T, NDIM> index_converter(dims);
      // The hit count lives on the device; the kernel reads it from
      // output_size_ptr while the grid is sized for the worst case (elem_cnt).
      CudaOffsetToNdIndexInplace<OUT_T, NDIM>
          <<<GetNumBlocks(elem_cnt), kBlockSize, 0, stream->As<ep::CudaStream>()->cuda_stream()>>>(
              index_converter, output_size_ptr, output_ptr);
    }
  }
  // Returns the hipCUB temp-storage byte count ArgWhere needs for elem_cnt
  // elements, obtained via CUB's null-temp-storage size query.
  static size_t GetWorkspaceBytesSize(ep::Stream* stream, int64_t elem_cnt) {
    hipStream_t cuda_stream = stream ? stream->As<ep::CudaStream>()->cuda_stream() : 0;
    size_t workspace = 0;
    if (NDIM == 1) {
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OUT_T*>(cuda_stream, elem_cnt, nullptr, workspace,
                                                     nullptr, nullptr, nullptr)));
    } else {
      using OutputIterator = StrideIterator<OUT_T, NDIM>;
      OutputIterator output_iter(nullptr, elem_cnt);
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OutputIterator>(
          cuda_stream, elem_cnt, nullptr, workspace, nullptr, output_iter, nullptr)));
    }
    return workspace;
  }
};
INSTANTIATE_ARG_WHERE_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA)
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/user/kernels/arg_where_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/core/common/small_vector.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hipcub/hipcub.hpp>
namespace oneflow {
namespace {
constexpr int kBlockSize = cuda::elementwise::kBlockSize;

// Returns the launch grid size for elem_cnt elements, as computed by the
// elementwise helper; aborts via OF_CUDA_CHECK on failure.
int GetNumBlocks(int64_t elem_cnt) {
  int block_count = 0;
  OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &block_count));
  return block_count;
}
// Minimal output-iterator shim handed to hipcub::DeviceSelect::Flagged.
// Logical element i lives at ptr_[i * NDIM]: the selection writes each hit's
// flat offset into the first slot of its NDIM-wide output row, which is later
// expanded in place into a full ND index (see CudaOffsetToNdIndexInplace).
template<typename T, int NDIM>
struct StrideIterator {
  typedef StrideIterator self_type;
  typedef std::ptrdiff_t difference_type;
  typedef T value_type;
  typedef T* pointer;
  typedef T& reference;
  // Advertised for iterator traits; only operator[] is actually implemented.
  typedef std::random_access_iterator_tag iterator_category;

  explicit StrideIterator(T* ptr, size_t max_iters) : ptr_(ptr), max_iters_(max_iters) {}

  OF_DEVICE_FUNC reference operator[](int i) {
    assert(0 <= i && i < max_iters_);
    return *(ptr_ + (i * NDIM));
  }

 private:
  T* ptr_;
  size_t max_iters_;  // addressable element count, only used by the assert
};
// Expands each selected flat offset into its full ND index, in place.
// output_ptr holds *output_size_ptr rows of NDIM entries; row i carries the
// flat offset in its first slot, which OffsetToNdIndex overwrites with the
// ND index. The row count is read from device memory (output_size_ptr), so
// this can be enqueued right after DeviceSelect without a host sync.
template<typename T, int NDIM>
__global__ void __launch_bounds__(kBlockSize)
    CudaOffsetToNdIndexInplace(NdIndexOffsetHelper<T, NDIM> index_converter,
                               const T* output_size_ptr, T* output_ptr) {
  CUDA_1D_KERNEL_LOOP_T(T, i, *output_size_ptr) {
    T* index_ptr = output_ptr + i * NDIM;
    index_converter.OffsetToNdIndex(*index_ptr, index_ptr);
  }
}
// Unary predicate mapping a value to its truthiness; flags nonzero input
// elements for DeviceSelect.
template<typename T>
struct IsTrue {
  __device__ __forceinline__ bool operator()(const T& value) const {
    return static_cast<bool>(value);
  }
};
// Compacts the flat offsets of all nonzero elements of input into output_iter
// via hipcub::DeviceSelect::Flagged (a counting iterator supplies offsets,
// IsTrue supplies the flags), writing the hit count to num_selected.
// When temp_storage is null, hipcub only writes the required temp-storage
// size into temp_storage_bytes (standard CUB two-phase protocol).
template<typename IN_T, typename OUT_T, typename OUT_ITER>
hipError_t SelectTrue(hipStream_t stream, int num_items, void* temp_storage,
                      size_t& temp_storage_bytes, const IN_T* input, OUT_ITER output_iter,
                      OUT_T* num_selected) {
  IsTrue<IN_T> is_true;
  hipcub::TransformInputIterator<bool, IsTrue<IN_T>, const IN_T*> flag_iter(input, is_true);
  hipcub::CountingInputIterator<OUT_T> offset_counter(0);
  return hipcub::DeviceSelect::Flagged(temp_storage, temp_storage_bytes, offset_counter, flag_iter,
                                       output_iter, num_selected, num_items, stream, false);
}
} // namespace
// CUDA/ROCm arg_where: writes the indices of all nonzero input elements to
// output_ptr and the number of hits to output_size_ptr (a device-side scalar).
template<typename IN_T, typename OUT_T, int NDIM>
struct ArgWhereKernelUtil<DeviceType::kCUDA, IN_T, OUT_T, NDIM> {
  static void ArgWhere(ep::Stream* stream, const ShapeView& input_shape, const IN_T* input_ptr,
                       void* temp_storage, size_t temp_storage_bytes, OUT_T* output_ptr,
                       OUT_T* output_size_ptr) {
    const int64_t elem_cnt = input_shape.elem_cnt();
    // deal with empty blob: report zero hits and skip all device work
    if (elem_cnt == 0) {
      Memset<DeviceType::kCUDA>(stream, output_size_ptr, 0, sizeof(OUT_T));
      return;
    }
    CHECK_NOTNULL(stream);
    // Offsets are stored as OUT_T, so the element count must fit in it.
    CHECK_LE(elem_cnt, std::numeric_limits<OUT_T>::max());
    size_t workspace = GetWorkspaceBytesSize(stream, elem_cnt);
    CHECK_LE(workspace, temp_storage_bytes);
    if (NDIM == 1) {
      // 1-D: the selected flat offsets already are the final indices.
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OUT_T*>(
          stream->As<ep::CudaStream>()->cuda_stream(), input_shape.elem_cnt(), temp_storage,
          workspace, input_ptr, output_ptr, output_size_ptr)));
    } else {
      // N-D: select flat offsets into the first slot of each NDIM-wide output
      // row, then expand every row to a full ND index in place.
      using OutputIterator = StrideIterator<OUT_T, NDIM>;
      OutputIterator output_iter(output_ptr, elem_cnt);
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OutputIterator>(
          stream->As<ep::CudaStream>()->cuda_stream(), elem_cnt, temp_storage, workspace, input_ptr,
          output_iter, output_size_ptr)));
      OUT_T dims[NDIM] = {0};
      std::transform(input_shape.ptr(), input_shape.ptr() + input_shape.NumAxes(), dims,
                     [](int64_t dim) { return static_cast<OUT_T>(dim); });
      NdIndexOffsetHelper<OUT_T, NDIM> index_converter(dims);
      // The hit count lives on the device; the kernel reads it from
      // output_size_ptr while the grid is sized for the worst case (elem_cnt).
      CudaOffsetToNdIndexInplace<OUT_T, NDIM>
          <<<GetNumBlocks(elem_cnt), kBlockSize, 0, stream->As<ep::CudaStream>()->cuda_stream()>>>(
              index_converter, output_size_ptr, output_ptr);
    }
  }
  // Returns the hipCUB temp-storage byte count ArgWhere needs for elem_cnt
  // elements, obtained via CUB's null-temp-storage size query.
  static size_t GetWorkspaceBytesSize(ep::Stream* stream, int64_t elem_cnt) {
    hipStream_t cuda_stream = stream ? stream->As<ep::CudaStream>()->cuda_stream() : 0;
    size_t workspace = 0;
    if (NDIM == 1) {
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OUT_T*>(cuda_stream, elem_cnt, nullptr, workspace,
                                                     nullptr, nullptr, nullptr)));
    } else {
      using OutputIterator = StrideIterator<OUT_T, NDIM>;
      OutputIterator output_iter(nullptr, elem_cnt);
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OutputIterator>(
          cuda_stream, elem_cnt, nullptr, workspace, nullptr, output_iter, nullptr)));
    }
    return workspace;
  }
};
INSTANTIATE_ARG_WHERE_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA)
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include <hipcub/hipcub.hpp>
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
// Carves one pre-allocated workspace into the two sub-buffers used by the
// argmax kernel: [ per-row KeyValuePair results | hipCUB temp storage ].
// "capacity" is the total workspace size in bytes.
template<typename T>
class TmpBufferManager final {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager);
  TmpBufferManager(int32_t capacity, void* ptr, int32_t instance_num)
      : capacity_{capacity}, key_value_out_elem_cnt_{instance_num} {
    const int32_t key_value_out_aligned_bytes =
        GetCudaAlignedSize(key_value_out_elem_cnt_ * sizeof(hipcub::KeyValuePair<int32_t, T>));
    key_value_out_ptr_ = reinterpret_cast<hipcub::KeyValuePair<int32_t, T>*>(ptr);
    temp_storage_ptr_ = reinterpret_cast<void*>(reinterpret_cast<char*>(key_value_out_ptr_)
                                                + key_value_out_aligned_bytes);
    // Remaining bytes feed the segmented reduce; must not be negative.
    temp_storage_bytes_ = capacity_ - key_value_out_aligned_bytes;
    CHECK_GE(temp_storage_bytes_, 0);
  }
  ~TmpBufferManager() = default;
  hipcub::KeyValuePair<int32_t, T>* KeyValueOutPtr() const { return key_value_out_ptr_; }
  void* TempStoragePtr() const { return temp_storage_ptr_; }
  int32_t TempStorageBytes() const { return temp_storage_bytes_; }

 private:
  int32_t capacity_;  // total workspace bytes
  hipcub::KeyValuePair<int32_t, T>* key_value_out_ptr_;  // per-row (index, value) results
  void* temp_storage_ptr_;
  int32_t key_value_out_elem_cnt_;  // one result per row (instance)
  int32_t temp_storage_bytes_;
};
// Maps a row index to that row's starting flat offset (idx * num_col); used as
// the segment-offset transform for the segmented ArgMax.
class MultiplyFunctor final {
 public:
  MultiplyFunctor(int32_t num_col) : num_col_(num_col) {}
  __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const {
    return num_col_ * idx;
  }

 private:
  int32_t num_col_;
};
// Queries the hipCUB temp-storage byte count required by a segmented ArgMax
// over a (num_row x num_col) row-major matrix: one segment per row.
template<typename T>
size_t InferTempStorageForArgMax(int32_t num_row, int32_t num_col) {
  // Segment i covers flat range [i * num_col, (i + 1) * num_col).
  using SegmentOffsetIter =
      hipcub::TransformInputIterator<int32_t, MultiplyFunctor, hipcub::CountingInputIterator<int32_t>>;
  hipcub::CountingInputIterator<int32_t> row_counter(0);
  MultiplyFunctor row_offset_functor(num_col);
  SegmentOffsetIter segment_offsets(row_counter, row_offset_functor);
  // Null d_temp_storage puts hipCUB in size-query mode: only the required byte
  // count is written into temp_storage_bytes; nothing is launched.
  size_t temp_storage_bytes = 0;
  auto err =
      hipcub::DeviceSegmentedReduce::ArgMax<T*, hipcub::KeyValuePair<int32_t, T>*, SegmentOffsetIter>(
          /* d_temp_storage */ nullptr, /* temp_storage_bytes */ temp_storage_bytes,
          /* d_in */ nullptr, /* d_out */ nullptr, /* num_segments */ num_row,
          /* d_begin_offsets */ segment_offsets, /* d_end_offsets */ segment_offsets + 1,
          /* stream */ 0);
  OF_CUDA_CHECK(err);
  return temp_storage_bytes;
}
// Runs a segmented ArgMax over a (num_row x num_col) row-major matrix — one
// segment per row — writing each row's (index-within-row, value) pair to
// out_ptr. temp_storage_ptr must hold at least the byte count returned by
// InferTempStorageForArgMax for the same geometry (checked below).
template<typename T>
void ArgMax(const T* in_ptr, int32_t num_row, int32_t num_col, void* temp_storage_ptr,
            int32_t temp_storage_bytes, hipcub::KeyValuePair<int32_t, T>* out_ptr,
            hipStream_t stream) {
  size_t rt_inferred_temp_storage_bytes = InferTempStorageForArgMax<T>(num_row, num_col);
  CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes);
  // Segment i covers flat range [i * num_col, (i + 1) * num_col).
  using SegmentOffsetIter =
      hipcub::TransformInputIterator<int32_t, MultiplyFunctor, hipcub::CountingInputIterator<int32_t>>;
  hipcub::CountingInputIterator<int32_t> counting_iter(0);
  MultiplyFunctor multiply_functor(num_col);
  SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor);
  auto err = hipcub::DeviceSegmentedReduce::ArgMax(
      /* d_temp_storage */ temp_storage_ptr,
      /* temp_storage_bytes */ rt_inferred_temp_storage_bytes,
      /* d_in */ in_ptr,
      /* d_out */ out_ptr,
      /* num_segments */ num_row,
      /* d_begin_offsets */ segment_offset_iter,
      /* d_end_offsets */ segment_offset_iter + 1,
      /* stream */ stream);
  OF_CUDA_CHECK(err);
}
// Copies each row's argmax position (the pair's key) into the int64 output,
// reduced modulo instance_size so it is relative to the row start.
template<typename T>
__global__ void WriteKeysToOutput(const int32_t instance_num, const int32_t instance_size,
                                  const hipcub::KeyValuePair<int32_t, T>* key_value_out_ptr,
                                  int64_t* out_ptr) {
  CUDA_1D_KERNEL_LOOP(i, instance_num) {
    const int32_t key = key_value_out_ptr[i].key;
    out_ptr[i] = key % instance_size;
  }
}
} // namespace
// GPU argmax over the innermost axis: for every row of "in", writes the int64
// position of the row's maximum to "out". tmp_buffer holds the KeyValuePair
// results plus hipCUB temp storage (see TmpBufferManager).
template<typename T>
class GpuArgMaxKernel final : public user_op::OpKernel {
 public:
  GpuArgMaxKernel() = default;
  ~GpuArgMaxKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int32_t elem_cnt = in->shape_view().elem_cnt();
    const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1);
    const int32_t instance_num = elem_cnt / instance_size;
    TmpBufferManager<T> buffer_manager(tmp_buffer->shape_view().elem_cnt(),
                                       tmp_buffer->mut_dptr<void>(), instance_num);
    hipStream_t cuda_stream = ctx->stream()->As<ep::CudaStream>()->cuda_stream();
    // Segmented reduce produces one (index, value) pair per row ...
    ArgMax(in->dptr<T>(), instance_num, instance_size, buffer_manager.TempStoragePtr(),
           buffer_manager.TempStorageBytes(), buffer_manager.KeyValueOutPtr(), cuda_stream);
    // ... then the keys are copied out as int64 row-relative positions.
    WriteKeysToOutput<T>
        <<<BlocksNum4ThreadsNum(instance_num), kCudaThreadsNumPerBlock, 0, cuda_stream>>>(
            instance_num, instance_size, buffer_manager.KeyValueOutPtr(),
            out->mut_dptr<int64_t>());
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers the "argmax" user kernel for CUDA/ROCm devices and infers its
// tmp_buffer size: [ per-row KeyValuePair results | hipCUB temp storage ],
// mirroring TmpBufferManager's layout.
#define REGISTER_CUDA_ARGMAX_KERNEL(dtype)                                                      \
  REGISTER_USER_KERNEL("argmax")                                                                \
      .SetCreateFn<GpuArgMaxKernel<dtype>>()                                                    \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                          \
                       && (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))         \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                       \
        const Shape& in_shape = ctx->InputShape("in", 0);                                       \
        const int32_t instance_size = in_shape.dim_vec().back();                                \
        const int32_t instance_num = in_shape.elem_cnt() / instance_size;                       \
                                                                                                \
        /* Key-Value Out */                                                                     \
        int32_t key_value_out_bytes =                                                           \
            GetCudaAlignedSize(instance_num * sizeof(hipcub::KeyValuePair<int32_t, dtype>));    \
                                                                                                \
        /* CUB Temp Storage */                                                                  \
        size_t temp_storage_bytes = InferTempStorageForArgMax<dtype>(instance_num, instance_size); \
                                                                                                \
        return key_value_out_bytes + temp_storage_bytes;                                        \
      });

// Element types supported by the GPU argmax kernel.
REGISTER_CUDA_ARGMAX_KERNEL(float)
REGISTER_CUDA_ARGMAX_KERNEL(double)
REGISTER_CUDA_ARGMAX_KERNEL(uint8_t)
REGISTER_CUDA_ARGMAX_KERNEL(int8_t)
REGISTER_CUDA_ARGMAX_KERNEL(int32_t)
REGISTER_CUDA_ARGMAX_KERNEL(int64_t)
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include <hipcub/hipcub.hpp>
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
// Carves one pre-allocated workspace into the two sub-buffers used by the
// argmax kernel: [ per-row KeyValuePair results | hipCUB temp storage ].
// "capacity" is the total workspace size in bytes.
template<typename T>
class TmpBufferManager final {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager);
  TmpBufferManager(int32_t capacity, void* ptr, int32_t instance_num)
      : capacity_{capacity}, key_value_out_elem_cnt_{instance_num} {
    const int32_t key_value_out_aligned_bytes =
        GetCudaAlignedSize(key_value_out_elem_cnt_ * sizeof(hipcub::KeyValuePair<int32_t, T>));
    key_value_out_ptr_ = reinterpret_cast<hipcub::KeyValuePair<int32_t, T>*>(ptr);
    temp_storage_ptr_ = reinterpret_cast<void*>(reinterpret_cast<char*>(key_value_out_ptr_)
                                                + key_value_out_aligned_bytes);
    // Remaining bytes feed the segmented reduce; must not be negative.
    temp_storage_bytes_ = capacity_ - key_value_out_aligned_bytes;
    CHECK_GE(temp_storage_bytes_, 0);
  }
  ~TmpBufferManager() = default;
  hipcub::KeyValuePair<int32_t, T>* KeyValueOutPtr() const { return key_value_out_ptr_; }
  void* TempStoragePtr() const { return temp_storage_ptr_; }
  int32_t TempStorageBytes() const { return temp_storage_bytes_; }

 private:
  int32_t capacity_;  // total workspace bytes
  hipcub::KeyValuePair<int32_t, T>* key_value_out_ptr_;  // per-row (index, value) results
  void* temp_storage_ptr_;
  int32_t key_value_out_elem_cnt_;  // one result per row (instance)
  int32_t temp_storage_bytes_;
};
// Maps a row index to that row's starting flat offset (idx * num_col); used as
// the segment-offset transform for the segmented ArgMax.
class MultiplyFunctor final {
 public:
  MultiplyFunctor(int32_t num_col) : num_col_(num_col) {}
  __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const {
    return num_col_ * idx;
  }

 private:
  int32_t num_col_;
};
// Queries the hipCUB temp-storage byte count required by a segmented ArgMax
// over a (num_row x num_col) row-major matrix: one segment per row.
template<typename T>
size_t InferTempStorageForArgMax(int32_t num_row, int32_t num_col) {
  // Segment i covers flat range [i * num_col, (i + 1) * num_col).
  using SegmentOffsetIter =
      hipcub::TransformInputIterator<int32_t, MultiplyFunctor, hipcub::CountingInputIterator<int32_t>>;
  hipcub::CountingInputIterator<int32_t> row_counter(0);
  MultiplyFunctor row_offset_functor(num_col);
  SegmentOffsetIter segment_offsets(row_counter, row_offset_functor);
  // Null d_temp_storage puts hipCUB in size-query mode: only the required byte
  // count is written into temp_storage_bytes; nothing is launched.
  size_t temp_storage_bytes = 0;
  auto err =
      hipcub::DeviceSegmentedReduce::ArgMax<T*, hipcub::KeyValuePair<int32_t, T>*, SegmentOffsetIter>(
          /* d_temp_storage */ nullptr, /* temp_storage_bytes */ temp_storage_bytes,
          /* d_in */ nullptr, /* d_out */ nullptr, /* num_segments */ num_row,
          /* d_begin_offsets */ segment_offsets, /* d_end_offsets */ segment_offsets + 1,
          /* stream */ 0);
  OF_CUDA_CHECK(err);
  return temp_storage_bytes;
}
// Runs a segmented ArgMax over a (num_row x num_col) row-major matrix — one
// segment per row — writing each row's (index-within-row, value) pair to
// out_ptr. temp_storage_ptr must hold at least the byte count returned by
// InferTempStorageForArgMax for the same geometry (checked below).
template<typename T>
void ArgMax(const T* in_ptr, int32_t num_row, int32_t num_col, void* temp_storage_ptr,
            int32_t temp_storage_bytes, hipcub::KeyValuePair<int32_t, T>* out_ptr,
            hipStream_t stream) {
  size_t rt_inferred_temp_storage_bytes = InferTempStorageForArgMax<T>(num_row, num_col);
  CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes);
  // Segment i covers flat range [i * num_col, (i + 1) * num_col).
  using SegmentOffsetIter =
      hipcub::TransformInputIterator<int32_t, MultiplyFunctor, hipcub::CountingInputIterator<int32_t>>;
  hipcub::CountingInputIterator<int32_t> counting_iter(0);
  MultiplyFunctor multiply_functor(num_col);
  SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor);
  auto err = hipcub::DeviceSegmentedReduce::ArgMax(
      /* d_temp_storage */ temp_storage_ptr,
      /* temp_storage_bytes */ rt_inferred_temp_storage_bytes,
      /* d_in */ in_ptr,
      /* d_out */ out_ptr,
      /* num_segments */ num_row,
      /* d_begin_offsets */ segment_offset_iter,
      /* d_end_offsets */ segment_offset_iter + 1,
      /* stream */ stream);
  OF_CUDA_CHECK(err);
}
// Copies each row's argmax position (the pair's key) into the int64 output,
// reduced modulo instance_size so it is relative to the row start.
template<typename T>
__global__ void WriteKeysToOutput(const int32_t instance_num, const int32_t instance_size,
                                  const hipcub::KeyValuePair<int32_t, T>* key_value_out_ptr,
                                  int64_t* out_ptr) {
  CUDA_1D_KERNEL_LOOP(i, instance_num) {
    const int32_t key = key_value_out_ptr[i].key;
    out_ptr[i] = key % instance_size;
  }
}
} // namespace
// GPU argmax over the innermost axis: for every row of "in", writes the int64
// position of the row's maximum to "out". tmp_buffer holds the KeyValuePair
// results plus hipCUB temp storage (see TmpBufferManager).
template<typename T>
class GpuArgMaxKernel final : public user_op::OpKernel {
 public:
  GpuArgMaxKernel() = default;
  ~GpuArgMaxKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int32_t elem_cnt = in->shape_view().elem_cnt();
    const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1);
    const int32_t instance_num = elem_cnt / instance_size;
    TmpBufferManager<T> buffer_manager(tmp_buffer->shape_view().elem_cnt(),
                                       tmp_buffer->mut_dptr<void>(), instance_num);
    hipStream_t cuda_stream = ctx->stream()->As<ep::CudaStream>()->cuda_stream();
    // Segmented reduce produces one (index, value) pair per row ...
    ArgMax(in->dptr<T>(), instance_num, instance_size, buffer_manager.TempStoragePtr(),
           buffer_manager.TempStorageBytes(), buffer_manager.KeyValueOutPtr(), cuda_stream);
    // ... then the keys are copied out as int64 row-relative positions.
    WriteKeysToOutput<T>
        <<<BlocksNum4ThreadsNum(instance_num), kCudaThreadsNumPerBlock, 0, cuda_stream>>>(
            instance_num, instance_size, buffer_manager.KeyValueOutPtr(),
            out->mut_dptr<int64_t>());
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers the "argmax" user kernel for CUDA/ROCm devices and infers its
// tmp_buffer size: [ per-row KeyValuePair results | hipCUB temp storage ],
// mirroring TmpBufferManager's layout.
#define REGISTER_CUDA_ARGMAX_KERNEL(dtype)                                                      \
  REGISTER_USER_KERNEL("argmax")                                                                \
      .SetCreateFn<GpuArgMaxKernel<dtype>>()                                                    \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                          \
                       && (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))         \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                       \
        const Shape& in_shape = ctx->InputShape("in", 0);                                       \
        const int32_t instance_size = in_shape.dim_vec().back();                                \
        const int32_t instance_num = in_shape.elem_cnt() / instance_size;                       \
                                                                                                \
        /* Key-Value Out */                                                                     \
        int32_t key_value_out_bytes =                                                           \
            GetCudaAlignedSize(instance_num * sizeof(hipcub::KeyValuePair<int32_t, dtype>));    \
                                                                                                \
        /* CUB Temp Storage */                                                                  \
        size_t temp_storage_bytes = InferTempStorageForArgMax<dtype>(instance_num, instance_size); \
                                                                                                \
        return key_value_out_bytes + temp_storage_bytes;                                        \
      });

// Element types supported by the GPU argmax kernel.
REGISTER_CUDA_ARGMAX_KERNEL(float)
REGISTER_CUDA_ARGMAX_KERNEL(double)
REGISTER_CUDA_ARGMAX_KERNEL(uint8_t)
REGISTER_CUDA_ARGMAX_KERNEL(int8_t)
REGISTER_CUDA_ARGMAX_KERNEL(int32_t)
REGISTER_CUDA_ARGMAX_KERNEL(int64_t)
} // namespace oneflow
\ No newline at end of file
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <cstdint>
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/common/just.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/framework/consistency_check.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
namespace oneflow {
namespace {
// Maximum tensor rank supported by the as_strided kernels.
constexpr size_t NUM_DIM = 8;
// POD parameter pack copied by value to the as_strided kernels.
// Only the first dest_num_dims entries of dest_dims / stride are meaningful.
template<size_t num_dims, typename IndexType>
struct AsStridedParams {
  NdIndexOffsetHelper<IndexType, num_dims> destIndexOffsetHelper;  // flat offset <-> nd-index for dest shape
  int64_t dest_dims[num_dims];  // destination (output / dy) dimensions
  int32_t stride[num_dims];     // per-dim element strides into the source (input / dx) buffer
  int32_t dest_num_dims;        // number of valid dims in the arrays above
  int32_t storage_offset;       // element offset of the first source element
  int32_t input_num;            // element count of the source buffer
  int32_t output_num;           // element count of the destination buffer
};
// Gather kernel: output_buf[i] = input_buf[storage_offset + <nd-index(i), stride>]
// for every flat output index i, where nd-index(i) is computed from
// params.destIndexOffsetHelper over the first params.dest_num_dims dims.
template<typename T>
__global__ void AsStrided_kernel(const T* input_buf, T* output_buf,
                                 AsStridedParams<NUM_DIM, int64_t> params) {
  // The array member decays to a pointer directly; the previous
  // reinterpret_cast (and the never-read dest_dims alias) were no-ops.
  const int32_t* stride = params.stride;
  CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) {
    int64_t dst_index[NUM_DIM];
    params.destIndexOffsetHelper.OffsetToNdIndex(i, dst_index, params.dest_num_dims);
    // NOTE(review): offset arithmetic is 32-bit, matching storage_offset/stride
    // types — assumes strided extents fit in int32; confirm for large tensors.
    int32_t index_in_input = params.storage_offset;
    FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_input += dst_index[j] * stride[j]; }
    output_buf[i] = input_buf[index_in_input];
  }
}
// Scatter-accumulate gradient: dx_buf[storage_offset + <nd-index(i), stride>] += dy_buf[i].
// Several dy elements can map to the same dx element (overlapping strides),
// so the accumulation uses an atomic add.
template<typename T>
__global__ void AsStridedGrad_kernel(const T* dy_buf, T* dx_buf,
                                     AsStridedParams<NUM_DIM, int64_t> params) {
  // The array member decays to a pointer directly; the previous
  // reinterpret_cast (and the never-read dest_dims alias) were no-ops.
  const int32_t* stride = params.stride;
  CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) {
    int64_t dy_index[NUM_DIM];
    params.destIndexOffsetHelper.OffsetToNdIndex(i, dy_index, params.dest_num_dims);
    // NOTE(review): 32-bit offset arithmetic; assumes dx extents fit in int32 — confirm.
    int32_t index_in_dx = params.storage_offset;
    FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_dx += dy_index[j] * stride[j]; }
    cuda::atomic::Add(dx_buf + index_in_dx, dy_buf[i]);
  }
}
template<typename T>
struct AsStridedFunctor final {
void operator()(ep::Stream* stream, const T* input_buf, T* output_buf, const int64_t* dest_dims,
const int32_t* stride, const int32_t dest_num_dims, const int32_t storage_offset,
const int32_t input_num, const int32_t output_num) {
NdIndexOffsetHelper<int64_t, NUM_DIM> destIndexOffsetHelper(dest_dims, dest_num_dims);
AsStridedParams<NUM_DIM, int64_t> params;
params.destIndexOffsetHelper = destIndexOffsetHelper;
FOR_RANGE(size_t, i, 0, dest_num_dims) {
params.dest_dims[i] = dest_dims[i];
params.stride[i] = stride[i];
}
params.dest_num_dims = dest_num_dims;
params.storage_offset = storage_offset;
params.input_num = input_num;
params.output_num = output_num;
AsStrided_kernel<T>
<<<BlocksNum4ThreadsNum(output_num), kCudaThreadsNumPerBlock, 0,
stream->As<ep::CudaStream>()->cuda_stream()>>>(input_buf, output_buf, params);
}
};
template<typename T>
struct AsStridedGradFunctor final {
void operator()(ep::Stream* stream, const T* dy_buf, T* dx_buf, const int64_t* dy_dims,
const int32_t* stride, const int32_t dy_num_dims, const int32_t storage_offset,
const int32_t dx_num, const int32_t dy_num) {
NdIndexOffsetHelper<int64_t, NUM_DIM> dyIndexOffsetHelper(dy_dims, dy_num_dims);
AsStridedParams<NUM_DIM, int64_t> params;
params.destIndexOffsetHelper = dyIndexOffsetHelper;
FOR_RANGE(size_t, i, 0, dy_num_dims) {
params.dest_dims[i] = dy_dims[i];
params.stride[i] = stride[i];
}
params.dest_num_dims = dy_num_dims;
params.storage_offset = storage_offset;
params.input_num = dx_num;
params.output_num = dy_num;
AsStridedGrad_kernel<T>
<<<BlocksNum4ThreadsNum(dy_num), kCudaThreadsNumPerBlock, 0,
stream->As<ep::CudaStream>()->cuda_stream()>>>(dy_buf, dx_buf, params);
}
};
} // namespace
// Forward as_strided kernel: views `input` through (stride, storage_offset)
// and materializes the result into `output`.
template<typename T>
class GpuAsStridedKernel final : public user_op::OpKernel {
 public:
  GpuAsStridedKernel() = default;
  ~GpuAsStridedKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0);
    user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0);
    // The "size" attr is not read here: it duplicates output->shape_view(),
    // which is what the kernel actually uses. (The unused local was removed.)
    const auto stride = ctx->Attr<std::vector<int32_t>>("stride");
    const int32_t storage_offset = ctx->Attr<int32_t>("storage_offset");
    size_t dest_num_dims = output->shape_view().NumAxes();
    const int64_t* dest_dims = output->shape_view().ptr();
    const size_t input_num = input->shape_view().Count(0);
    const size_t output_num = output->shape_view().Count(0);
    if (input_num == 0) {
      // 0-size tensor: nothing to gather.
      return;
    }
    AsStridedFunctor<T>()(ctx->stream(), input->dptr<T>(), output->mut_dptr<T>(), dest_dims,
                          stride.data(), dest_num_dims, storage_offset, input_num, output_num);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Backward as_strided kernel: zero-fills `dx`, then atomically scatters `dy`
// back through the strided mapping.
template<typename T>
class GpuAsStridedGradKernel final : public user_op::OpKernel {
 public:
  GpuAsStridedGradKernel() = default;
  ~GpuAsStridedGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
    user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
    // The "size" attr is not read here: it duplicates dy->shape_view().
    // (The unused local was removed.)
    const auto stride = ctx->Attr<std::vector<int32_t>>("stride");
    const int32_t storage_offset = ctx->Attr<int32_t>("storage_offset");
    size_t dy_num_dims = dy->shape_view().NumAxes();
    const int64_t* dy_dims = dy->shape_view().ptr();
    const size_t dx_num = dx->shape_view().Count(0);
    const size_t dy_num = dy->shape_view().Count(0);
    // dx must start from zero because the grad kernel accumulates with atomic adds.
    Memset<DeviceType::kCUDA>(ctx->stream(), dx->mut_dptr(), 0, dx_num * sizeof(T));
    AsStridedGradFunctor<T>()(ctx->stream(), dy->dptr<T>(), dx->mut_dptr<T>(), dy_dims,
                              stride.data(), dy_num_dims, storage_offset, dx_num, dy_num);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers forward and backward as_strided kernels for `in_type` on CUDA devices.
#define REGISTER_GPUASSTRIDED_KERNEL(in_type)                                                 \
  REGISTER_USER_KERNEL("as_strided")                                                          \
      .SetCreateFn<GpuAsStridedKernel<in_type>>()                                             \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                        \
                       && (user_op::HobDataType("input", 0) == GetDataType<in_type>::value)); \
  REGISTER_USER_KERNEL("as_strided_grad")                                                     \
      .SetCreateFn<GpuAsStridedGradKernel<in_type>>()                                         \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                        \
                       && (user_op::HobDataType("input", 0) == GetDataType<in_type>::value));

REGISTER_GPUASSTRIDED_KERNEL(half);
REGISTER_GPUASSTRIDED_KERNEL(float);
REGISTER_GPUASSTRIDED_KERNEL(double);
REGISTER_GPUASSTRIDED_KERNEL(int64_t);
#undef REGISTER_GPUASSTRIDED_KERNEL
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <cstdint>
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/common/just.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/framework/consistency_check.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
namespace oneflow {
namespace {
// Maximum tensor rank supported by the as_strided kernels.
constexpr size_t NUM_DIM = 8;
// POD parameter pack copied by value to the as_strided kernels.
// Only the first dest_num_dims entries of dest_dims / stride are meaningful.
template<size_t num_dims, typename IndexType>
struct AsStridedParams {
  NdIndexOffsetHelper<IndexType, num_dims> destIndexOffsetHelper;  // flat offset <-> nd-index for dest shape
  int64_t dest_dims[num_dims];  // destination (output / dy) dimensions
  int32_t stride[num_dims];     // per-dim element strides into the source (input / dx) buffer
  int32_t dest_num_dims;        // number of valid dims in the arrays above
  int32_t storage_offset;       // element offset of the first source element
  int32_t input_num;            // element count of the source buffer
  int32_t output_num;           // element count of the destination buffer
};
// Gather kernel: output_buf[i] = input_buf[storage_offset + <nd-index(i), stride>]
// for every flat output index i, where nd-index(i) is computed from
// params.destIndexOffsetHelper over the first params.dest_num_dims dims.
template<typename T>
__global__ void AsStrided_kernel(const T* input_buf, T* output_buf,
                                 AsStridedParams<NUM_DIM, int64_t> params) {
  // The array member decays to a pointer directly; the previous
  // reinterpret_cast (and the never-read dest_dims alias) were no-ops.
  const int32_t* stride = params.stride;
  CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) {
    int64_t dst_index[NUM_DIM];
    params.destIndexOffsetHelper.OffsetToNdIndex(i, dst_index, params.dest_num_dims);
    // NOTE(review): offset arithmetic is 32-bit, matching storage_offset/stride
    // types — assumes strided extents fit in int32; confirm for large tensors.
    int32_t index_in_input = params.storage_offset;
    FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_input += dst_index[j] * stride[j]; }
    output_buf[i] = input_buf[index_in_input];
  }
}
// Scatter-accumulate gradient: dx_buf[storage_offset + <nd-index(i), stride>] += dy_buf[i].
// Several dy elements can map to the same dx element (overlapping strides),
// so the accumulation uses an atomic add.
template<typename T>
__global__ void AsStridedGrad_kernel(const T* dy_buf, T* dx_buf,
                                     AsStridedParams<NUM_DIM, int64_t> params) {
  // The array member decays to a pointer directly; the previous
  // reinterpret_cast (and the never-read dest_dims alias) were no-ops.
  const int32_t* stride = params.stride;
  CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) {
    int64_t dy_index[NUM_DIM];
    params.destIndexOffsetHelper.OffsetToNdIndex(i, dy_index, params.dest_num_dims);
    // NOTE(review): 32-bit offset arithmetic; assumes dx extents fit in int32 — confirm.
    int32_t index_in_dx = params.storage_offset;
    FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_dx += dy_index[j] * stride[j]; }
    cuda::atomic::Add(dx_buf + index_in_dx, dy_buf[i]);
  }
}
template<typename T>
struct AsStridedFunctor final {
void operator()(ep::Stream* stream, const T* input_buf, T* output_buf, const int64_t* dest_dims,
const int32_t* stride, const int32_t dest_num_dims, const int32_t storage_offset,
const int32_t input_num, const int32_t output_num) {
NdIndexOffsetHelper<int64_t, NUM_DIM> destIndexOffsetHelper(dest_dims, dest_num_dims);
AsStridedParams<NUM_DIM, int64_t> params;
params.destIndexOffsetHelper = destIndexOffsetHelper;
FOR_RANGE(size_t, i, 0, dest_num_dims) {
params.dest_dims[i] = dest_dims[i];
params.stride[i] = stride[i];
}
params.dest_num_dims = dest_num_dims;
params.storage_offset = storage_offset;
params.input_num = input_num;
params.output_num = output_num;
AsStrided_kernel<T>
<<<BlocksNum4ThreadsNum(output_num), kCudaThreadsNumPerBlock, 0,
stream->As<ep::CudaStream>()->cuda_stream()>>>(input_buf, output_buf, params);
}
};
template<typename T>
struct AsStridedGradFunctor final {
void operator()(ep::Stream* stream, const T* dy_buf, T* dx_buf, const int64_t* dy_dims,
const int32_t* stride, const int32_t dy_num_dims, const int32_t storage_offset,
const int32_t dx_num, const int32_t dy_num) {
NdIndexOffsetHelper<int64_t, NUM_DIM> dyIndexOffsetHelper(dy_dims, dy_num_dims);
AsStridedParams<NUM_DIM, int64_t> params;
params.destIndexOffsetHelper = dyIndexOffsetHelper;
FOR_RANGE(size_t, i, 0, dy_num_dims) {
params.dest_dims[i] = dy_dims[i];
params.stride[i] = stride[i];
}
params.dest_num_dims = dy_num_dims;
params.storage_offset = storage_offset;
params.input_num = dx_num;
params.output_num = dy_num;
AsStridedGrad_kernel<T>
<<<BlocksNum4ThreadsNum(dy_num), kCudaThreadsNumPerBlock, 0,
stream->As<ep::CudaStream>()->cuda_stream()>>>(dy_buf, dx_buf, params);
}
};
} // namespace
// Forward as_strided kernel: views `input` through (stride, storage_offset)
// and materializes the result into `output`.
template<typename T>
class GpuAsStridedKernel final : public user_op::OpKernel {
 public:
  GpuAsStridedKernel() = default;
  ~GpuAsStridedKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0);
    user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0);
    // The "size" attr is not read here: it duplicates output->shape_view(),
    // which is what the kernel actually uses. (The unused local was removed.)
    const auto stride = ctx->Attr<std::vector<int32_t>>("stride");
    const int32_t storage_offset = ctx->Attr<int32_t>("storage_offset");
    size_t dest_num_dims = output->shape_view().NumAxes();
    const int64_t* dest_dims = output->shape_view().ptr();
    const size_t input_num = input->shape_view().Count(0);
    const size_t output_num = output->shape_view().Count(0);
    if (input_num == 0) {
      // 0-size tensor: nothing to gather.
      return;
    }
    AsStridedFunctor<T>()(ctx->stream(), input->dptr<T>(), output->mut_dptr<T>(), dest_dims,
                          stride.data(), dest_num_dims, storage_offset, input_num, output_num);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Backward as_strided kernel: zero-fills `dx`, then atomically scatters `dy`
// back through the strided mapping.
template<typename T>
class GpuAsStridedGradKernel final : public user_op::OpKernel {
 public:
  GpuAsStridedGradKernel() = default;
  ~GpuAsStridedGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
    user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
    // The "size" attr is not read here: it duplicates dy->shape_view().
    // (The unused local was removed.)
    const auto stride = ctx->Attr<std::vector<int32_t>>("stride");
    const int32_t storage_offset = ctx->Attr<int32_t>("storage_offset");
    size_t dy_num_dims = dy->shape_view().NumAxes();
    const int64_t* dy_dims = dy->shape_view().ptr();
    const size_t dx_num = dx->shape_view().Count(0);
    const size_t dy_num = dy->shape_view().Count(0);
    // dx must start from zero because the grad kernel accumulates with atomic adds.
    Memset<DeviceType::kCUDA>(ctx->stream(), dx->mut_dptr(), 0, dx_num * sizeof(T));
    AsStridedGradFunctor<T>()(ctx->stream(), dy->dptr<T>(), dx->mut_dptr<T>(), dy_dims,
                              stride.data(), dy_num_dims, storage_offset, dx_num, dy_num);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers forward and backward as_strided kernels for `in_type` on CUDA devices.
#define REGISTER_GPUASSTRIDED_KERNEL(in_type)                                                 \
  REGISTER_USER_KERNEL("as_strided")                                                          \
      .SetCreateFn<GpuAsStridedKernel<in_type>>()                                             \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                        \
                       && (user_op::HobDataType("input", 0) == GetDataType<in_type>::value)); \
  REGISTER_USER_KERNEL("as_strided_grad")                                                     \
      .SetCreateFn<GpuAsStridedGradKernel<in_type>>()                                         \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                        \
                       && (user_op::HobDataType("input", 0) == GetDataType<in_type>::value));

REGISTER_GPUASSTRIDED_KERNEL(half);
REGISTER_GPUASSTRIDED_KERNEL(float);
REGISTER_GPUASSTRIDED_KERNEL(double);
REGISTER_GPUASSTRIDED_KERNEL(int64_t);
#undef REGISTER_GPUASSTRIDED_KERNEL
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
// Elementwise copy of `value` into `ref`, gated by the scalar *condition:
// with assign_if == true the copy runs iff *condition != 0; with
// assign_if == false it runs iff *condition == 0. Otherwise all threads exit.
template<bool assign_if, typename C, typename T>
__global__ void AssignGpu(int64_t elem_cnt, const C* condition, const T* value, T* ref) {
  const bool condition_is_zero = (*condition == 0);
  if (condition_is_zero == assign_if) { return; }
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) { ref[i] = value[i]; }
}
// Conditionally assigns `value` into `ref` on the GPU. The copy happens only
// when the scalar `condition` agrees with the `assign_if` flag (see AssignGpu).
template<bool assign_if, typename C, typename T>
class AssignIfGPUKernel final : public user_op::OpKernel {
 public:
  AssignIfGPUKernel() = default;
  ~AssignIfGPUKernel() override = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* condition = ctx->Tensor4ArgNameAndIndex("condition", 0);
    // condition must be a single-element 1-D tensor.
    CHECK_EQ(condition->shape_view().NumAxes(), 1);
    CHECK_EQ(condition->shape_view().At(0), 1);
    const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0);
    user_op::Tensor* ref = ctx->Tensor4ArgNameAndIndex("ref", 0);
    // Aliasing buffers: the copy would be a no-op, so skip the launch entirely.
    if (value->dptr() == ref->dptr()) { return; }
    CHECK_EQ(value->shape_view(), ref->shape_view());
    CHECK_EQ(value->data_type(), ref->data_type());
    const size_t elem_cnt = ref->shape_view().elem_cnt();
    // The condition value is read on-device inside the kernel, so no host
    // synchronization is needed to decide whether to copy.
    AssignGpu<assign_if, C, T><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                 ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, condition->dptr<C>(), value->dptr<T>(), ref->mut_dptr<T>());
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; }
};
} // namespace
// Registers "assign_if" (copy when condition != 0) and "assign_if_not"
// (copy when condition == 0) for every integer condition type crossed with
// every POD value type.
#define REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(op_type_name, assign_if, condition_type, \
                                                         value_type)                              \
  REGISTER_USER_KERNEL(op_type_name)                                                              \
      .SetCreateFn<AssignIfGPUKernel<assign_if, condition_type, value_type>>()                    \
      .SetIsMatchedHob(                                                                           \
          (user_op::HobDeviceType() == DeviceType::kCUDA)                                         \
          && (user_op::HobDataType("condition", 0) == GetDataType<condition_type>::value)         \
          && (user_op::HobDataType("value", 0) == GetDataType<value_type>::value));

#define REGISTER_ASSIGN_IF_CUDA_KERNEL(condition_type, value_type)                        \
  REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(                                       \
      "assign_if", true, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type)); \
  REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(                                       \
      "assign_if_not", false, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type))

OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_ASSIGN_IF_CUDA_KERNEL, INT_DATA_TYPE_SEQ,
                                 POD_DATA_TYPE_SEQ)
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
// Elementwise copy of `value` into `ref`, gated by the scalar *condition:
// with assign_if == true the copy runs iff *condition != 0; with
// assign_if == false it runs iff *condition == 0. Otherwise all threads exit.
template<bool assign_if, typename C, typename T>
__global__ void AssignGpu(int64_t elem_cnt, const C* condition, const T* value, T* ref) {
  const bool condition_is_zero = (*condition == 0);
  if (condition_is_zero == assign_if) { return; }
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) { ref[i] = value[i]; }
}
// Conditionally assigns `value` into `ref` on the GPU. The copy happens only
// when the scalar `condition` agrees with the `assign_if` flag (see AssignGpu).
template<bool assign_if, typename C, typename T>
class AssignIfGPUKernel final : public user_op::OpKernel {
 public:
  AssignIfGPUKernel() = default;
  ~AssignIfGPUKernel() override = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* condition = ctx->Tensor4ArgNameAndIndex("condition", 0);
    // condition must be a single-element 1-D tensor.
    CHECK_EQ(condition->shape_view().NumAxes(), 1);
    CHECK_EQ(condition->shape_view().At(0), 1);
    const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0);
    user_op::Tensor* ref = ctx->Tensor4ArgNameAndIndex("ref", 0);
    // Aliasing buffers: the copy would be a no-op, so skip the launch entirely.
    if (value->dptr() == ref->dptr()) { return; }
    CHECK_EQ(value->shape_view(), ref->shape_view());
    CHECK_EQ(value->data_type(), ref->data_type());
    const size_t elem_cnt = ref->shape_view().elem_cnt();
    // The condition value is read on-device inside the kernel, so no host
    // synchronization is needed to decide whether to copy.
    AssignGpu<assign_if, C, T><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                 ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, condition->dptr<C>(), value->dptr<T>(), ref->mut_dptr<T>());
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; }
};
} // namespace
// Registers "assign_if" (copy when condition != 0) and "assign_if_not"
// (copy when condition == 0) for every integer condition type crossed with
// every POD value type.
#define REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(op_type_name, assign_if, condition_type, \
                                                         value_type)                              \
  REGISTER_USER_KERNEL(op_type_name)                                                              \
      .SetCreateFn<AssignIfGPUKernel<assign_if, condition_type, value_type>>()                    \
      .SetIsMatchedHob(                                                                           \
          (user_op::HobDeviceType() == DeviceType::kCUDA)                                         \
          && (user_op::HobDataType("condition", 0) == GetDataType<condition_type>::value)         \
          && (user_op::HobDataType("value", 0) == GetDataType<value_type>::value));

#define REGISTER_ASSIGN_IF_CUDA_KERNEL(condition_type, value_type)                        \
  REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(                                      \
      "assign_if", true, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type)); \
  REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(                                      \
      "assign_if_not", false, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type))

OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_ASSIGN_IF_CUDA_KERNEL, INT_DATA_TYPE_SEQ,
                                 POD_DATA_TYPE_SEQ)
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include <cstdint>
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/user/kernels/avg_pool_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
// Per-block thread count shared by all avg-pool launches below.
constexpr int kBlockSize = cuda::elementwise::kBlockSize;
// Returns min(elem_num, kBlockSize); small problems get fewer threads per block.
int GetMinThreadNum(const int64_t elem_num) { return std::min<int64_t>(elem_num, kBlockSize); }
// Error-checked wrapper around cuda::elementwise::GetNumBlocks.
int GetNumBlocks(int32_t elem_cnt) {
  int num_blocks = 0;
  OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks));
  return num_blocks;
}
} // namespace
// __global__ trampolines for avg-pool: each forwards its arguments verbatim to
// the corresponding Avgpool*Compute routine. One thread handles one element of
// the index space described by index_helper/elem_num.

// 1-D forward: pools over the last axis (length x_length).
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool1dForward(const NdIndexOffsetHelper<IDX, 2> index_helper, IDX elem_num,
                                const T* src, T* dest, int32_t padding_l, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_length,
                                const int32_t kernel_size_l, const int32_t stride_l,
                                const bool count_include_pad, const int32_t divisor_override) {
  Avgpool1dForwardCompute<T>(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel,
                             x_length, kernel_size_l, stride_l, count_include_pad,
                             divisor_override);
};
// 2-D forward: pools over (x_height, x_width).
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool2dForward(const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num,
                                const T* src, T* dest, const int32_t padding_h,
                                const int32_t padding_w, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_height,
                                const int32_t x_width, const int32_t kernel_size_h,
                                const int32_t kernel_size_w, const int32_t stride_h,
                                const int32_t stride_w, const bool count_include_pad,
                                const int32_t divisor_override) {
  Avgpool2dForwardCompute<T>(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch,
                             n_channel, x_height, x_width, kernel_size_h, kernel_size_w, stride_h,
                             stride_w, count_include_pad, divisor_override);
};
// 3-D forward: pools over (x_time, x_height, x_width).
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool3dForward(const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num,
                                const T* src, T* dest, int32_t padding_t, const int32_t padding_h,
                                const int32_t padding_w, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_time,
                                const int32_t x_height, const int32_t x_width,
                                const int32_t kernel_size_t, int32_t kernel_size_h,
                                const int32_t kernel_size_w, const int32_t stride_t,
                                const int32_t stride_h, const int32_t stride_w,
                                const bool count_include_pad, const int32_t divisor_override) {
  Avgpool3dForwardCompute<T>(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w,
                             n_batch, n_channel, x_time, x_height, x_width, kernel_size_t,
                             kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w,
                             count_include_pad, divisor_override);
};
// 1-D backward: src is dy, dest is dx.
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool1dBackward(const NdIndexOffsetHelper<IDX, 2> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_l,
                                 const int32_t n_batch, const int32_t n_channel,
                                 const int32_t input_length, const int32_t kernel_size_l,
                                 const int32_t stride_l, const bool count_include_pad,
                                 const int32_t divisor_override) {
  Avgpool1dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel,
                              input_length, kernel_size_l, stride_l, count_include_pad,
                              divisor_override);
};
// 2-D backward: src is dy, dest is dx.
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool2dBackward(const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_h,
                                 const int32_t padding_w, const int32_t n_batch,
                                 const int32_t n_channel, const int32_t input_height,
                                 const int32_t input_width, const int32_t kernel_size_h,
                                 const int32_t kernel_size_w, const int32_t stride_h,
                                 const int32_t stride_w, const bool count_include_pad,
                                 int32_t divisor_override) {
  Avgpool2dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch,
                              n_channel, input_height, input_width, kernel_size_h, kernel_size_w,
                              stride_h, stride_w, count_include_pad, divisor_override);
};
// 3-D backward: src is dy, dest is dx.
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__ void DoCUDAAvgPool3dBackward(
    const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num, const T* src, T* dest,
    const int32_t padding_t, const int32_t padding_h, const int32_t padding_w,
    const int32_t n_batch, const int32_t n_channel, const int32_t x_time, const int32_t x_height,
    const int32_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h,
    const int32_t kernel_size_w, const int32_t stride_t, const int32_t stride_h,
    const int32_t stride_w, const bool count_include_pad, const int32_t divisor_override) {
  Avgpool3dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w,
                              n_batch, n_channel, x_time, x_height, x_width, kernel_size_t,
                              kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w,
                              count_include_pad, divisor_override);
};
// CUDA specialization of AvgPoolKernelUtil: each static method unpacks
// AvgPoolParams3D (a 3-D parameterization where 1-D/2-D pools use the trailing
// entries of padding()/pool_size_3d()/stride_3d(), e.g. index [2] for 1-D)
// and launches the matching DoCUDAAvgPool* kernel on `stream`.
template<typename T, typename IDX>
struct AvgPoolKernelUtil<DeviceType::kCUDA, T, IDX> {
  static void Avgpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool1dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(),
        params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }
  static void Avgpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool1dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(),
        params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }
  static void Avgpool2dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 3>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool2dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2],
        params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(),
        params_3d.divisor_override());
  }
  static void Avgpool2dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool2dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2],
        params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(),
        params_3d.divisor_override());
  }
  static void Avgpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool3dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1],
        params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4),
        params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2],
        params_3d.count_include_pad(), params_3d.divisor_override());
  }
  static void Avgpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool3dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1],
        params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4),
        params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2],
        params_3d.count_include_pad(), params_3d.divisor_override());
  }
};
// Explicit instantiations for every (dtype, index type) combination.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_AVG_POOL_KERNEL_UTIL, (DeviceType::kCUDA),
                                 AVG_POOL_DATA_TYPE_CUDA_SEQ, AVG_POOL_IDX_DATA_TYPE_SEQ);
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include <cstdint>
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/user/kernels/avg_pool_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
// Per-block thread count shared by all avg-pool launches below.
constexpr int kBlockSize = cuda::elementwise::kBlockSize;
// Returns min(elem_num, kBlockSize); small problems get fewer threads per block.
int GetMinThreadNum(const int64_t elem_num) { return std::min<int64_t>(elem_num, kBlockSize); }
// Error-checked wrapper around cuda::elementwise::GetNumBlocks.
int GetNumBlocks(int32_t elem_cnt) {
  int num_blocks = 0;
  OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks));
  return num_blocks;
}
} // namespace
// __global__ entry point that forwards its arguments to
// Avgpool1dForwardCompute (launched by AvgPoolKernelUtil::Avgpool1dForward).
// `padding_l` made const for consistency with the sibling kernels; stray ';'
// after the function body removed.
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool1dForward(const NdIndexOffsetHelper<IDX, 2> index_helper, IDX elem_num,
                                const T* src, T* dest, const int32_t padding_l,
                                const int32_t n_batch, const int32_t n_channel,
                                const int32_t x_length, const int32_t kernel_size_l,
                                const int32_t stride_l, const bool count_include_pad,
                                const int32_t divisor_override) {
  Avgpool1dForwardCompute<T>(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel,
                             x_length, kernel_size_l, stride_l, count_include_pad,
                             divisor_override);
}
// __global__ entry point that forwards its arguments to
// Avgpool2dForwardCompute (launched by AvgPoolKernelUtil::Avgpool2dForward).
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool2dForward(const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num,
                                const T* src, T* dest, const int32_t padding_h,
                                const int32_t padding_w, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_height,
                                const int32_t x_width, const int32_t kernel_size_h,
                                const int32_t kernel_size_w, const int32_t stride_h,
                                const int32_t stride_w, const bool count_include_pad,
                                const int32_t divisor_override) {
  Avgpool2dForwardCompute<T>(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch,
                             n_channel, x_height, x_width, kernel_size_h, kernel_size_w, stride_h,
                             stride_w, count_include_pad, divisor_override);
};
// __global__ entry point that forwards its arguments to
// Avgpool3dForwardCompute (launched by AvgPoolKernelUtil::Avgpool3dForward).
// `padding_t` and `kernel_size_h` made const for consistency with the sibling
// kernels; stray ';' after the function body removed.
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool3dForward(const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num,
                                const T* src, T* dest, const int32_t padding_t,
                                const int32_t padding_h, const int32_t padding_w,
                                const int32_t n_batch, const int32_t n_channel,
                                const int32_t x_time, const int32_t x_height,
                                const int32_t x_width, const int32_t kernel_size_t,
                                const int32_t kernel_size_h, const int32_t kernel_size_w,
                                const int32_t stride_t, const int32_t stride_h,
                                const int32_t stride_w, const bool count_include_pad,
                                const int32_t divisor_override) {
  Avgpool3dForwardCompute<T>(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w,
                             n_batch, n_channel, x_time, x_height, x_width, kernel_size_t,
                             kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w,
                             count_include_pad, divisor_override);
}
// __global__ entry point that forwards its arguments to
// Avgpool1dBackwardCompute (launched by AvgPoolKernelUtil::Avgpool1dBackward).
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool1dBackward(const NdIndexOffsetHelper<IDX, 2> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_l,
                                 const int32_t n_batch, const int32_t n_channel,
                                 const int32_t input_length, const int32_t kernel_size_l,
                                 const int32_t stride_l, const bool count_include_pad,
                                 const int32_t divisor_override) {
  Avgpool1dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel,
                              input_length, kernel_size_l, stride_l, count_include_pad,
                              divisor_override);
};
// __global__ entry point that forwards its arguments to
// Avgpool2dBackwardCompute (launched by AvgPoolKernelUtil::Avgpool2dBackward).
// `divisor_override` made const for consistency with the sibling kernels;
// stray ';' after the function body removed.
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool2dBackward(const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_h,
                                 const int32_t padding_w, const int32_t n_batch,
                                 const int32_t n_channel, const int32_t input_height,
                                 const int32_t input_width, const int32_t kernel_size_h,
                                 const int32_t kernel_size_w, const int32_t stride_h,
                                 const int32_t stride_w, const bool count_include_pad,
                                 const int32_t divisor_override) {
  Avgpool2dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch,
                              n_channel, input_height, input_width, kernel_size_h, kernel_size_w,
                              stride_h, stride_w, count_include_pad, divisor_override);
}
// __global__ entry point that forwards its arguments to
// Avgpool3dBackwardCompute (launched by AvgPoolKernelUtil::Avgpool3dBackward).
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__ void DoCUDAAvgPool3dBackward(
    const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num, const T* src, T* dest,
    const int32_t padding_t, const int32_t padding_h, const int32_t padding_w,
    const int32_t n_batch, const int32_t n_channel, const int32_t x_time, const int32_t x_height,
    const int32_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h,
    const int32_t kernel_size_w, const int32_t stride_t, const int32_t stride_h,
    const int32_t stride_w, const bool count_include_pad, const int32_t divisor_override) {
  Avgpool3dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w,
                              n_batch, n_channel, x_time, x_height, x_width, kernel_size_t,
                              kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w,
                              count_include_pad, divisor_override);
};
// Host-side launcher for the avg-pool kernels above.  AvgPoolParams3D stores
// 1D/2D/3D pooling uniformly as 3D parameters, apparently ordered
// (depth/time, height, width): the 1D case reads only component [2], the 2D
// case components [1] and [2], and the 3D case all three — TODO confirm
// against AvgPoolParams3D.  GetXShape5D() appears to be the input viewed as
// 5D with the spatial dims at At(2)..At(4); verify against the params class.
template<typename T, typename IDX>
struct AvgPoolKernelUtil<DeviceType::kCUDA, T, IDX> {
  // 1D forward: width components only ([2] / At(4)).
  static void Avgpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool1dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(),
        params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }
  // 1D backward: same parameter selection as the 1D forward.
  static void Avgpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool1dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(),
        params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }
  // 2D forward: height/width components ([1], [2] / At(3), At(4)).
  static void Avgpool2dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 3>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool2dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2],
        params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(),
        params_3d.divisor_override());
  }
  // 2D backward: same parameter selection as the 2D forward.
  static void Avgpool2dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool2dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2],
        params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(),
        params_3d.divisor_override());
  }
  // 3D forward: all three components ([0..2] / At(2)..At(4)).
  static void Avgpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool3dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1],
        params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4),
        params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2],
        params_3d.count_include_pad(), params_3d.divisor_override());
  }
  // 3D backward: same parameter selection as the 3D forward.
  static void Avgpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool3dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1],
        params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4),
        params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2],
        params_3d.count_include_pad(), params_3d.divisor_override());
  }
};
// Explicitly instantiate AvgPoolKernelUtil for every (CUDA dtype, index type)
// pair in the two sequences.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_AVG_POOL_KERNEL_UTIL, (DeviceType::kCUDA),
                                 AVG_POOL_DATA_TYPE_CUDA_SEQ, AVG_POOL_IDX_DATA_TYPE_SEQ);
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/user/kernels/batch_gather_kernel_util.h"
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <assert.h>
namespace oneflow {
namespace {

// Translates a flat offset into the (batch, index, instance) shaped output
// to the matching flat offset into the (batch, gather_dim, instance) shaped
// input.
template<typename K>
__device__ int64_t GetInOffset(const int64_t out_offset, const K* indices,
                               const int64_t indices_num, const int64_t instance_size,
                               const int64_t gather_dim_size) {
  const int64_t batch_stride = indices_num * instance_size;
  const int64_t batch_idx = out_offset / batch_stride;
  const int64_t indices_idx = (out_offset % batch_stride) / instance_size;
  const int64_t inner_idx = out_offset % instance_size;
  const int64_t idx = indices[batch_idx * indices_num + indices_idx];
  // Out-of-range indices are a caller bug; trap in device code.
  assert(idx >= 0 && idx < gather_dim_size);
  return (batch_idx * gather_dim_size + idx) * instance_size + inner_idx;
}

// out[i] = in[GetInOffset(i)] for every output element.
template<typename T, typename K>
__global__ void BatchGatherForwardGpu(const int64_t elem_cnt, const T* in, const K* indices,
                                      const int64_t indices_num, const int64_t instance_size,
                                      const int64_t gather_dim_size, T* out) {
  CUDA_1D_KERNEL_LOOP(idx, elem_cnt) {
    const int64_t in_offset =
        GetInOffset<K>(idx, indices, indices_num, instance_size, gather_dim_size);
    out[idx] = in[in_offset];
  }
}

// Scatter-add of out_diff back into in_diff; atomic because several output
// positions may map to the same input position.
template<typename T, typename K>
__global__ void BatchGatherBackwardGpu(const int64_t elem_cnt, const T* out_diff, const K* indices,
                                       const int64_t indices_num, const int64_t instance_size,
                                       const int64_t gather_dim_size, T* in_diff) {
  CUDA_1D_KERNEL_LOOP(idx, elem_cnt) {
    const int64_t in_offset =
        GetInOffset<K>(idx, indices, indices_num, instance_size, gather_dim_size);
    cuda::atomic::Add(in_diff + in_offset, out_diff[idx]);
  }
}

}  // namespace
// CUDA specialization of the batch-gather utility; definitions follow below.
template<typename T, typename K>
struct BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K> final {
  // Gathers rows of `in` along the gather dim according to `indices`; the
  // output has flat shape (batch_num, indices_num, instance_size).
  static void Forward(ep::Stream* stream, const T* in, const K* indices,
                      const Shape& flat_out_shape, const int64_t gather_dim_size, T* out);
  // Accumulates `out_diff` back into `in_diff` via atomic scatter-add.
  static void Backward(ep::Stream* stream, const T* out_diff, const K* indices,
                       const Shape& flat_out_diff_shape, const int64_t gather_dim_size, T* in_diff);
};
// Launches the batch-gather forward kernel on the given CUDA stream.
// flat_out_shape is (batch_num, indices_num, instance_size).
template<typename T, typename K>
void BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K>::Forward(ep::Stream* stream, const T* in,
                                                                 const K* indices,
                                                                 const Shape& flat_out_shape,
                                                                 const int64_t gather_dim_size,
                                                                 T* out) {
  const int64_t batch_num = flat_out_shape.At(0);
  const int64_t indices_num = flat_out_shape.At(1);
  const int64_t instance_size = flat_out_shape.At(2);
  const int64_t elem_cnt = batch_num * indices_num * instance_size;
  // Empty output: launching a zero-sized grid is an invalid CUDA
  // configuration, so return early (mirrors the n == 0 guard used elsewhere).
  if (elem_cnt == 0) { return; }
  BatchGatherForwardGpu<T, K><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                stream->As<ep::CudaStream>()->cuda_stream()>>>(
      elem_cnt, in, indices, indices_num, instance_size, gather_dim_size, out);
}
// Launches the batch-gather backward (atomic scatter-add) kernel on the given
// CUDA stream.  flat_out_diff_shape is (batch_num, indices_num, instance_size).
template<typename T, typename K>
void BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K>::Backward(
    ep::Stream* stream, const T* out_diff, const K* indices, const Shape& flat_out_diff_shape,
    const int64_t gather_dim_size, T* in_diff) {
  const int64_t batch_num = flat_out_diff_shape.At(0);
  const int64_t indices_num = flat_out_diff_shape.At(1);
  const int64_t instance_size = flat_out_diff_shape.At(2);
  const int64_t elem_cnt = batch_num * indices_num * instance_size;
  // Empty gradient: launching a zero-sized grid is an invalid CUDA
  // configuration, so return early (mirrors the n == 0 guard used elsewhere).
  if (elem_cnt == 0) { return; }
  BatchGatherBackwardGpu<T, K><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                 stream->As<ep::CudaStream>()->cuda_stream()>>>(
      elem_cnt, out_diff, indices, indices_num, instance_size, gather_dim_size, in_diff);
}
// Instantiate the CUDA batch-gather impl for every (floating dtype, integer
// index type) combination, then drop the helper macro.
#define INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA(in_type_pair, index_type_pair) \
  template struct BatchGatherKernelUtilImpl<DeviceType::kCUDA, OF_PP_PAIR_FIRST(in_type_pair), \
                                            OF_PP_PAIR_FIRST(index_type_pair)>;
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA,
                                 FLOATING_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ);
#undef INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/user/kernels/batch_gather_kernel_util.h"
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <assert.h>
namespace oneflow {
namespace {

// Translates a flat offset into the (batch, index, instance) shaped output
// to the matching flat offset into the (batch, gather_dim, instance) shaped
// input.
template<typename K>
__device__ int64_t GetInOffset(const int64_t out_offset, const K* indices,
                               const int64_t indices_num, const int64_t instance_size,
                               const int64_t gather_dim_size) {
  const int64_t batch_stride = indices_num * instance_size;
  const int64_t batch_idx = out_offset / batch_stride;
  const int64_t indices_idx = (out_offset % batch_stride) / instance_size;
  const int64_t inner_idx = out_offset % instance_size;
  const int64_t idx = indices[batch_idx * indices_num + indices_idx];
  // Out-of-range indices are a caller bug; trap in device code.
  assert(idx >= 0 && idx < gather_dim_size);
  return (batch_idx * gather_dim_size + idx) * instance_size + inner_idx;
}

// out[i] = in[GetInOffset(i)] for every output element.
template<typename T, typename K>
__global__ void BatchGatherForwardGpu(const int64_t elem_cnt, const T* in, const K* indices,
                                      const int64_t indices_num, const int64_t instance_size,
                                      const int64_t gather_dim_size, T* out) {
  CUDA_1D_KERNEL_LOOP(idx, elem_cnt) {
    const int64_t in_offset =
        GetInOffset<K>(idx, indices, indices_num, instance_size, gather_dim_size);
    out[idx] = in[in_offset];
  }
}

// Scatter-add of out_diff back into in_diff; atomic because several output
// positions may map to the same input position.
template<typename T, typename K>
__global__ void BatchGatherBackwardGpu(const int64_t elem_cnt, const T* out_diff, const K* indices,
                                       const int64_t indices_num, const int64_t instance_size,
                                       const int64_t gather_dim_size, T* in_diff) {
  CUDA_1D_KERNEL_LOOP(idx, elem_cnt) {
    const int64_t in_offset =
        GetInOffset<K>(idx, indices, indices_num, instance_size, gather_dim_size);
    cuda::atomic::Add(in_diff + in_offset, out_diff[idx]);
  }
}

}  // namespace
// CUDA specialization of the batch-gather utility; definitions follow below.
template<typename T, typename K>
struct BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K> final {
  // Gathers rows of `in` along the gather dim according to `indices`; the
  // output has flat shape (batch_num, indices_num, instance_size).
  static void Forward(ep::Stream* stream, const T* in, const K* indices,
                      const Shape& flat_out_shape, const int64_t gather_dim_size, T* out);
  // Accumulates `out_diff` back into `in_diff` via atomic scatter-add.
  static void Backward(ep::Stream* stream, const T* out_diff, const K* indices,
                       const Shape& flat_out_diff_shape, const int64_t gather_dim_size, T* in_diff);
};
// Launches the batch-gather forward kernel on the given CUDA stream.
// flat_out_shape is (batch_num, indices_num, instance_size).
template<typename T, typename K>
void BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K>::Forward(ep::Stream* stream, const T* in,
                                                                 const K* indices,
                                                                 const Shape& flat_out_shape,
                                                                 const int64_t gather_dim_size,
                                                                 T* out) {
  const int64_t batch_num = flat_out_shape.At(0);
  const int64_t indices_num = flat_out_shape.At(1);
  const int64_t instance_size = flat_out_shape.At(2);
  const int64_t elem_cnt = batch_num * indices_num * instance_size;
  // Empty output: launching a zero-sized grid is an invalid CUDA
  // configuration, so return early (mirrors the n == 0 guard used elsewhere).
  if (elem_cnt == 0) { return; }
  BatchGatherForwardGpu<T, K><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                stream->As<ep::CudaStream>()->cuda_stream()>>>(
      elem_cnt, in, indices, indices_num, instance_size, gather_dim_size, out);
}
// Launches the batch-gather backward (atomic scatter-add) kernel on the given
// CUDA stream.  flat_out_diff_shape is (batch_num, indices_num, instance_size).
template<typename T, typename K>
void BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K>::Backward(
    ep::Stream* stream, const T* out_diff, const K* indices, const Shape& flat_out_diff_shape,
    const int64_t gather_dim_size, T* in_diff) {
  const int64_t batch_num = flat_out_diff_shape.At(0);
  const int64_t indices_num = flat_out_diff_shape.At(1);
  const int64_t instance_size = flat_out_diff_shape.At(2);
  const int64_t elem_cnt = batch_num * indices_num * instance_size;
  // Empty gradient: launching a zero-sized grid is an invalid CUDA
  // configuration, so return early (mirrors the n == 0 guard used elsewhere).
  if (elem_cnt == 0) { return; }
  BatchGatherBackwardGpu<T, K><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                 stream->As<ep::CudaStream>()->cuda_stream()>>>(
      elem_cnt, out_diff, indices, indices_num, instance_size, gather_dim_size, in_diff);
}
// Instantiate the CUDA batch-gather impl for every (floating dtype, integer
// index type) combination, then drop the helper macro.
#define INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA(in_type_pair, index_type_pair) \
  template struct BatchGatherKernelUtilImpl<DeviceType::kCUDA, OF_PP_PAIR_FIRST(in_type_pair), \
                                            OF_PP_PAIR_FIRST(index_type_pair)>;
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA,
                                 FLOATING_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ);
#undef INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/user/kernels/loss_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace user_op {
namespace {
using namespace loss;
// Elementwise binary cross entropy on probabilities:
//   bce(x, y) = (y - 1) * max(log(1 - x), -100) - y * max(log(x), -100).
// The -100 clamp keeps the loss finite when x reaches exactly 0 or 1.
template<typename T>
struct BinaryCrossEntropyFunctor {
  T zero_;              // lower bound asserted on the input
  T one_;               // upper bound asserted on the input
  T negative_hundred_;  // clamp applied to each log term
  BinaryCrossEntropyFunctor()
      : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()), negative_hundred_(static_cast<T>(-100)) {}
  // Unweighted loss; input must already be a probability in [0, 1].
  __device__ __forceinline__ T operator()(T input_val, T target_val) const {
    assert(input_val >= zero_);
    assert(input_val <= one_);
    return (target_val - one_) * max(static_cast<T>(log(one_ - input_val)), negative_hundred_)
           - target_val * max(static_cast<T>(log(input_val)), negative_hundred_);
  }
  // Weighted loss: unweighted result scaled by weight_val.
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
// float specialization of the BCE functor; uses logf and names the two
// clamped log terms for readability.  Behavior is identical to the generic
// template instantiated at float.
template<>
struct BinaryCrossEntropyFunctor<float> {
  float zero_;
  float one_;
  float negative_hundred_;
  BinaryCrossEntropyFunctor() : zero_(0.f), one_(1.f), negative_hundred_(-100.f) {}
  // Unweighted loss; input must already be a probability in [0, 1].
  __device__ __forceinline__ float operator()(float input_val, float target_val) const {
    assert(input_val >= zero_);
    assert(input_val <= one_);
    const float log_one_minus_input = max(logf(one_ - input_val), negative_hundred_);
    const float log_input = max(logf(input_val), negative_hundred_);
    return (target_val - one_) * log_one_minus_input - target_val * log_input;
  }
  // Weighted loss: unweighted result scaled by weight_val.
  __device__ __forceinline__ float operator()(float input_val, float target_val,
                                              float weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
// half specialization: converts to float, computes via the float functor,
// and converts the result back to half.
template<>
struct BinaryCrossEntropyFunctor<half> {
  BinaryCrossEntropyFunctor<float> float_functor;
  __device__ __forceinline__ half operator()(half input_val, half target_val) const {
    return __float2half(float_functor(__half2float(input_val), __half2float(target_val)));
  }
  // Weighted overload multiplies two half values directly (device half
  // arithmetic), unlike the grad functor below which widens to float first.
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
// Gradient of BCE w.r.t. the input probability:
//   dL/dx = dy * (x - y) / max((1 - x) * x, eps).
// eps = 1e-12 guards the division when x is exactly 0 or 1.
template<typename T>
struct BinaryCrossEntropyGradFunctor {
  T eps_;  // denominator clamp
  T one_;
  BinaryCrossEntropyGradFunctor() : eps_(static_cast<T>(1e-12)), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const {
    return dy_val * (input_val - target_val) / max((one_ - input_val) * input_val, eps_);
  }
  // Weighted gradient: unweighted result scaled by weight_val.
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const {
    return (*this)(input_val, target_val, dy_val) * weight_val;
  }
};
// half specialization of the gradient: all arithmetic (including the weight
// multiply) is performed in float and narrowed back to half at the end.
template<>
struct BinaryCrossEntropyGradFunctor<half> {
  BinaryCrossEntropyGradFunctor<float> float_functor;
  BinaryCrossEntropyGradFunctor() {}
  __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const {
    return __float2half(
        float_functor(__half2float(input_val), __half2float(target_val), __half2float(dy_val)));
  }
  __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val,
                                             half weight_val) const {
    return __float2half(float_functor(__half2float(input_val), __half2float(target_val),
                                      __half2float(dy_val), __half2float(weight_val)));
  }
};
// CUDA kernel for elementwise binary cross entropy on probabilities; applies
// BinaryCrossEntropyFunctor per element, optionally scaled by the "weight"
// input when present.
template<typename T>
class BinaryCrossEntropyKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyKernel() = default;
  ~BinaryCrossEntropyKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    if (ctx->has_input("weight", 0)) {
      // Weighted variant: ternary elementwise launch over (input, target, weight).
      const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
      OF_CUDA_CHECK(
          (cuda::elementwise::Ternary(BinaryCrossEntropyFunctor<T>(), elem_cnt, out, input, target,
                                      weight, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    } else {
      // Unweighted variant: binary elementwise launch over (input, target).
      OF_CUDA_CHECK(
          (cuda::elementwise::Binary(BinaryCrossEntropyFunctor<T>(), elem_cnt, out, input, target,
                                     ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// CUDA kernel for the BCE input gradient; applies BinaryCrossEntropyGradFunctor
// per element over (input, target, dy) and optionally the "weight" input.
template<typename T>
class BinaryCrossEntropyGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyGradKernel() = default;
  ~BinaryCrossEntropyGradKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    if (ctx->has_input("weight", 0)) {
      // Four inputs (input, target, dy, weight): no Quaternary helper, so use
      // the generic launcher directly.
      const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
      using FunctorT = BinaryCrossEntropyGradFunctor<T>;
      using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
      OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
          FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    } else {
      // Unweighted variant: ternary elementwise launch over (input, target, dy).
      OF_CUDA_CHECK((cuda::elementwise::Ternary(
          BinaryCrossEntropyGradFunctor<T>(), elem_cnt, dx, input, target, dy,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
} // namespace
// Registers the forward kernel for a dtype; matched on CUDA device and the
// dtypes of input/target/out.
#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype) \
  REGISTER_USER_KERNEL("binary_cross_entropy") \
      .SetCreateFn<BinaryCrossEntropyKernel<dtype>>() \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));
// Registers the gradient kernel for a dtype; additionally matched on dy/dx.
#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype) \
  REGISTER_USER_KERNEL("binary_cross_entropy_grad") \
      .SetCreateFn<BinaryCrossEntropyGradKernel<dtype>>() \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
// Instantiate registrations for half, float, and double.
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double)
} // namespace user_op
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/user/kernels/loss_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace user_op {
namespace {
using namespace loss;
// Elementwise binary cross entropy on probabilities:
//   bce(x, y) = (y - 1) * max(log(1 - x), -100) - y * max(log(x), -100).
// The -100 clamp keeps the loss finite when x reaches exactly 0 or 1.
template<typename T>
struct BinaryCrossEntropyFunctor {
  T zero_;              // lower bound asserted on the input
  T one_;               // upper bound asserted on the input
  T negative_hundred_;  // clamp applied to each log term
  BinaryCrossEntropyFunctor()
      : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()), negative_hundred_(static_cast<T>(-100)) {}
  // Unweighted loss; input must already be a probability in [0, 1].
  __device__ __forceinline__ T operator()(T input_val, T target_val) const {
    assert(input_val >= zero_);
    assert(input_val <= one_);
    return (target_val - one_) * max(static_cast<T>(log(one_ - input_val)), negative_hundred_)
           - target_val * max(static_cast<T>(log(input_val)), negative_hundred_);
  }
  // Weighted loss: unweighted result scaled by weight_val.
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
// float specialization of the BCE functor; uses logf and names the two
// clamped log terms for readability.  Behavior is identical to the generic
// template instantiated at float.
template<>
struct BinaryCrossEntropyFunctor<float> {
  float zero_;
  float one_;
  float negative_hundred_;
  BinaryCrossEntropyFunctor() : zero_(0.f), one_(1.f), negative_hundred_(-100.f) {}
  // Unweighted loss; input must already be a probability in [0, 1].
  __device__ __forceinline__ float operator()(float input_val, float target_val) const {
    assert(input_val >= zero_);
    assert(input_val <= one_);
    const float log_one_minus_input = max(logf(one_ - input_val), negative_hundred_);
    const float log_input = max(logf(input_val), negative_hundred_);
    return (target_val - one_) * log_one_minus_input - target_val * log_input;
  }
  // Weighted loss: unweighted result scaled by weight_val.
  __device__ __forceinline__ float operator()(float input_val, float target_val,
                                              float weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
// half specialization: converts to float, computes via the float functor,
// and converts the result back to half.
template<>
struct BinaryCrossEntropyFunctor<half> {
  BinaryCrossEntropyFunctor<float> float_functor;
  __device__ __forceinline__ half operator()(half input_val, half target_val) const {
    return __float2half(float_functor(__half2float(input_val), __half2float(target_val)));
  }
  // Weighted overload multiplies two half values directly (device half
  // arithmetic), unlike the grad functor below which widens to float first.
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
// Gradient of BCE w.r.t. the input probability:
//   dL/dx = dy * (x - y) / max((1 - x) * x, eps).
// eps = 1e-12 guards the division when x is exactly 0 or 1.
template<typename T>
struct BinaryCrossEntropyGradFunctor {
  T eps_;  // denominator clamp
  T one_;
  BinaryCrossEntropyGradFunctor() : eps_(static_cast<T>(1e-12)), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const {
    return dy_val * (input_val - target_val) / max((one_ - input_val) * input_val, eps_);
  }
  // Weighted gradient: unweighted result scaled by weight_val.
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const {
    return (*this)(input_val, target_val, dy_val) * weight_val;
  }
};
// half specialization of the gradient: all arithmetic (including the weight
// multiply) is performed in float and narrowed back to half at the end.
template<>
struct BinaryCrossEntropyGradFunctor<half> {
  BinaryCrossEntropyGradFunctor<float> float_functor;
  BinaryCrossEntropyGradFunctor() {}
  __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const {
    return __float2half(
        float_functor(__half2float(input_val), __half2float(target_val), __half2float(dy_val)));
  }
  __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val,
                                             half weight_val) const {
    return __float2half(float_functor(__half2float(input_val), __half2float(target_val),
                                      __half2float(dy_val), __half2float(weight_val)));
  }
};
// CUDA kernel for elementwise binary cross entropy on probabilities; applies
// BinaryCrossEntropyFunctor per element, optionally scaled by the "weight"
// input when present.
template<typename T>
class BinaryCrossEntropyKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyKernel() = default;
  ~BinaryCrossEntropyKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    if (ctx->has_input("weight", 0)) {
      // Weighted variant: ternary elementwise launch over (input, target, weight).
      const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
      OF_CUDA_CHECK(
          (cuda::elementwise::Ternary(BinaryCrossEntropyFunctor<T>(), elem_cnt, out, input, target,
                                      weight, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    } else {
      // Unweighted variant: binary elementwise launch over (input, target).
      OF_CUDA_CHECK(
          (cuda::elementwise::Binary(BinaryCrossEntropyFunctor<T>(), elem_cnt, out, input, target,
                                     ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// CUDA kernel for the BCE input gradient; applies BinaryCrossEntropyGradFunctor
// per element over (input, target, dy) and optionally the "weight" input.
template<typename T>
class BinaryCrossEntropyGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyGradKernel() = default;
  ~BinaryCrossEntropyGradKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    if (ctx->has_input("weight", 0)) {
      // Four inputs (input, target, dy, weight): no Quaternary helper, so use
      // the generic launcher directly.
      const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
      using FunctorT = BinaryCrossEntropyGradFunctor<T>;
      using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
      OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
          FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    } else {
      // Unweighted variant: ternary elementwise launch over (input, target, dy).
      OF_CUDA_CHECK((cuda::elementwise::Ternary(
          BinaryCrossEntropyGradFunctor<T>(), elem_cnt, dx, input, target, dy,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
} // namespace
// Registers the forward kernel for a dtype; matched on CUDA device and the
// dtypes of input/target/out.
#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype) \
  REGISTER_USER_KERNEL("binary_cross_entropy") \
      .SetCreateFn<BinaryCrossEntropyKernel<dtype>>() \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));
// Registers the gradient kernel for a dtype; additionally matched on dy/dx.
#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype) \
  REGISTER_USER_KERNEL("binary_cross_entropy_grad") \
      .SetCreateFn<BinaryCrossEntropyGradKernel<dtype>>() \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
// Instantiate registrations for half, float, and double.
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double)
} // namespace user_op
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ndarray/ndarray_util.h"
#include "oneflow/core/ndarray/xpu_var_ndarray.h"
#include "oneflow/user/kernels/loss_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace user_op {
namespace {
using namespace loss;
// Selects which optional scaling inputs a BCE-with-logits functor consumes.
// Do not reorder: enumerator identity selects template specializations below.
enum class WeightType {
  kNone,       // neither "weight" nor "pos_weight" present
  kWeight,     // elementwise "weight" input only
  kPosWeight,  // "pos_weight" input only
  kBoth,       // both "weight" and "pos_weight"
};
// Elementwise forward functors for BCE-with-logits, specialized per WeightType.
// All use the numerically stable form of the loss:
//   loss = (1 - t) * x + max(-x, 0) + log(exp(-max) + exp(-x - max))
// which equals (1 - t) * x + log(1 + exp(-x)) without overflow for large |x|.
template<typename T, WeightType WEIGHT_TYPE>
struct BinaryCrossEntropyWithLogitsFunctor;
// Generic (double) variant: uses the double-precision log/exp overloads; float and
// half get dedicated specializations below.
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone> {
  T zero_;
  T one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val) const {
    const T max_val = -input_val < zero_ ? zero_ : -input_val;
    return (one_ - target_val) * input_val + max_val
           + (log(exp(-max_val) + exp(-input_val - max_val)));
  }
};
// kPosWeight: `weight_val` is pos_weight already broadcast-multiplied by target
// (the kernels below precompute that product into tmp_buffer), so
// weight_val - target_val + 1 == 1 + target * (pos_weight - 1).
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight> {
  T zero_;
  T one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    const T max_val = -input_val < zero_ ? zero_ : -input_val;
    const T pos_weight_processed_val = weight_val - target_val + one_;
    return (one_ - target_val) * input_val
           + (pos_weight_processed_val
              * (log(exp(-max_val) + exp(-input_val - max_val)) + max_val));
  }
};
// float specialization: single-precision logf/expf to avoid double promotion.
template<>
struct BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kNone> {
  float zero_;
  float one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val) const {
    const float max_val = -input_val < zero_ ? zero_ : -input_val;
    return (one_ - target_val) * input_val + max_val
           + (logf(expf(-max_val) + expf(-input_val - max_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kPosWeight> {
  float zero_;
  float one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val,
                                              float weight_val) const {
    const float max_val = -input_val < zero_ ? zero_ : -input_val;
    const float pos_weight_processed_val = weight_val - target_val + one_;
    return (one_ - target_val) * input_val
           + (pos_weight_processed_val
              * (logf(expf(-max_val) + expf(-input_val - max_val)) + max_val));
  }
};
// kWeight: plain loss scaled by the elementwise weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    return f(input_val, target_val) * weight_val;
  }
};
// kBoth: pos_weight loss scaled by the elementwise weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val,
                                          T pos_weight_val) const {
    return f(input_val, target_val, pos_weight_val) * weight_val;
  }
};
// half specializations compute in float and convert at the boundaries.
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kNone> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kNone> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val) const {
    return __float2half(f(__half2float(input_val), __half2float(target_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kPosWeight> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kPosWeight> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return __float2half(
        f(__half2float(input_val), __half2float(target_val), __half2float(weight_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kWeight> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return __float2half(
        f(__half2float(input_val), __half2float(target_val), __half2float(weight_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kBoth> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val, half weight_val,
                                             half pos_weight_val) const {
    return __float2half(f(__half2float(input_val), __half2float(target_val),
                          __half2float(weight_val), __half2float(pos_weight_val)));
  }
};
// Sigmoid via the tanh identity: sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5.
// Generic variant uses the double tanh overload (intended for T = double).
template<typename T>
__device__ __forceinline__ T CalSigmoid(const T x) {
  const T half_of_one = static_cast<T>(0.5);
  return half_of_one * tanh(half_of_one * x) + half_of_one;
}
// float: single-precision tanhf avoids double promotion.
template<>
__device__ __forceinline__ float CalSigmoid(const float x) {
  const float half_of_one = static_cast<float>(0.5);
  return half_of_one * tanhf(half_of_one * x) + half_of_one;
}
// half: compute in float, convert at the boundaries.
template<>
__device__ __forceinline__ half CalSigmoid(const half x) {
  return __float2half(CalSigmoid(__half2float(x)));
}
// Elementwise backward functors for BCE-with-logits, specialized per WeightType.
template<typename T, WeightType WEIGHT_TYPE>
struct BinaryCrossEntropyWithLogitsGradFunctor;
// d/dx [(1-t)x + log(1+exp(-x))] = sigmoid(x) - t, scaled by upstream dy.
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone> {
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const {
    return (CalSigmoid(input_val) - target_val) * dy_val;
  }
};
// kPosWeight: `weight_val` is pos_weight broadcast-multiplied by target
// (precomputed by the grad kernel into tmp_buffer).
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight> {
  T one_;
  BinaryCrossEntropyWithLogitsGradFunctor() : one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const {
    return dy_val * ((weight_val + one_ - target_val) * CalSigmoid(input_val) - weight_val);
  }
};
// kWeight: plain gradient scaled by the elementwise weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const {
    return f(input_val, target_val, dy_val) * weight_val;
  }
};
// kBoth: pos_weight gradient scaled by the elementwise weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val,
                                          T pos_weight_val) const {
    return f(input_val, target_val, dy_val, pos_weight_val) * weight_val;
  }
};
// GPU forward kernel for "binary_cross_entropy_with_logits". Dispatches one of four
// elementwise launches keyed on the "has_pos_weight" attribute and the optional
// "weight" input. When pos_weight is used, tmp_buffer (sized by GenFwInferTmpSizeFn)
// holds pos_weight broadcast-multiplied by target before the elementwise launch.
template<typename T>
class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsKernel() = default;
  ~BinaryCrossEntropyWithLogitsKernel() override = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    if (ctx->Attr<bool>("has_pos_weight")) {
      // pos_weight is broadcast along the last axis of target and pre-multiplied
      // with target into tmp_buffer; the functors consume that product.
      T* pos_weight_processed = tmp_buffer_blob->mut_dptr<T>();
      const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr<T>();
      Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes());
      pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1,
                           ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt());
      NdarrayUtil<DeviceType::kCUDA, T>::BroadcastMul(
          ctx->stream(), XpuVarNdarray<T>(target_blob->shape_view(), pos_weight_processed),
          XpuVarNdarray<const T>(pos_weight_shape, pos_weight),
          XpuVarNdarray<const T>(target_blob->shape_view(), target));
      if (ctx->has_input("weight", 0)) {
        // kBoth: 4-input launch (input, target, weight, pos_weight * target).
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kBoth>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, out, input, target, weight, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight>(), elem_cnt, out, input,
            target, pos_weight_processed, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    } else {
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kWeight>(), elem_cnt, out, input,
            target, weight, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Binary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone>(), elem_cnt, out, input,
            target, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// GPU backward kernel for "binary_cross_entropy_with_logits". Mirrors the forward
// kernel's dispatch: four launch variants keyed on "has_pos_weight" and the optional
// "weight" input; when pos_weight is present, tmp_buffer (sized by
// GenBwInferTmpSizeFn) holds pos_weight broadcast-multiplied by target.
template<typename T>
class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsGradKernel() = default;
  // Fix: marked `override` for consistency with the forward kernel's destructor
  // (silences -Winconsistent-missing-override; base destructor is virtual).
  ~BinaryCrossEntropyWithLogitsGradKernel() override = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    if (ctx->Attr<bool>("has_pos_weight")) {
      // pos_weight is broadcast along the last axis of target and pre-multiplied
      // with target into tmp_buffer; the grad functors consume that product.
      T* pos_weight_processed = tmp_buffer_blob->mut_dptr<T>();
      const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr<T>();
      Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes());
      pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1,
                           ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt());
      NdarrayUtil<DeviceType::kCUDA, T>::BroadcastMul(
          ctx->stream(), XpuVarNdarray<T>(target_blob->shape_view(), pos_weight_processed),
          XpuVarNdarray<const T>(pos_weight_shape, pos_weight),
          XpuVarNdarray<const T>(target_blob->shape_view(), target));
      if (ctx->has_input("weight", 0)) {
        // kBoth: 5-input launch (input, target, dy, weight, pos_weight * target).
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kBoth>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    } else {
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kWeight>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone>(), elem_cnt, dx, input,
            target, dy, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Builds the forward tmp-buffer size function: scratch space is needed only when
// "has_pos_weight" is set, in which case the broadcast pos_weight * target product
// occupies one aligned buffer with the same element count as "input".
template<typename T>
user_op::InferTmpSizeFn GenFwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) -> size_t {
    if (!ctx->Attr<bool>("has_pos_weight")) { return 0; }
    const int64_t elem_cnt = ctx->InputShape("input", 0).elem_cnt();
    return GetCudaAlignedSize(elem_cnt * sizeof(T));
  };
}
// Builds the backward tmp-buffer size function; identical policy to the forward
// variant but sized from the "target" input's element count.
template<typename T>
user_op::InferTmpSizeFn GenBwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) -> size_t {
    if (!ctx->Attr<bool>("has_pos_weight")) { return 0; }
    const int64_t elem_cnt = ctx->InputShape("target", 0).elem_cnt();
    return GetCudaAlignedSize(elem_cnt * sizeof(T));
  };
}
} // namespace
// Registers the forward "binary_cross_entropy_with_logits" kernel for one element
// type; the tmp-size fn reserves scratch for the broadcast pos_weight product.
#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype)                                        \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits")                                 \
      .SetCreateFn<BinaryCrossEntropyWithLogitsKernel<dtype>>()                            \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))   \
      .SetInferTmpSizeFn(GenFwInferTmpSizeFn<dtype>());
// Registers the matching backward kernel; also constrains "dy" and "dx" dtypes.
#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype)                                   \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_grad")                            \
      .SetCreateFn<BinaryCrossEntropyWithLogitsGradKernel<dtype>>()                        \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)     \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value))    \
      .SetInferTmpSizeFn(GenBwInferTmpSizeFn<dtype>());
// Instantiate the registrations for the three supported element types.
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double)
} // namespace user_op
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ndarray/ndarray_util.h"
#include "oneflow/core/ndarray/xpu_var_ndarray.h"
#include "oneflow/user/kernels/loss_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace user_op {
namespace {
using namespace loss;
// Selects which optional scaling inputs a BCE-with-logits functor consumes.
// Do not reorder: enumerator identity selects template specializations below.
enum class WeightType {
  kNone,       // neither "weight" nor "pos_weight" present
  kWeight,     // elementwise "weight" input only
  kPosWeight,  // "pos_weight" input only
  kBoth,       // both "weight" and "pos_weight"
};
// Elementwise forward functors for BCE-with-logits, specialized per WeightType.
// All use the numerically stable form of the loss:
//   loss = (1 - t) * x + max(-x, 0) + log(exp(-max) + exp(-x - max))
// which equals (1 - t) * x + log(1 + exp(-x)) without overflow for large |x|.
template<typename T, WeightType WEIGHT_TYPE>
struct BinaryCrossEntropyWithLogitsFunctor;
// Generic (double) variant: uses the double-precision log/exp overloads; float and
// half get dedicated specializations below.
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone> {
  T zero_;
  T one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val) const {
    const T max_val = -input_val < zero_ ? zero_ : -input_val;
    return (one_ - target_val) * input_val + max_val
           + (log(exp(-max_val) + exp(-input_val - max_val)));
  }
};
// kPosWeight: `weight_val` is pos_weight already broadcast-multiplied by target
// (the kernels below precompute that product into tmp_buffer), so
// weight_val - target_val + 1 == 1 + target * (pos_weight - 1).
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight> {
  T zero_;
  T one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    const T max_val = -input_val < zero_ ? zero_ : -input_val;
    const T pos_weight_processed_val = weight_val - target_val + one_;
    return (one_ - target_val) * input_val
           + (pos_weight_processed_val
              * (log(exp(-max_val) + exp(-input_val - max_val)) + max_val));
  }
};
// float specialization: single-precision logf/expf to avoid double promotion.
template<>
struct BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kNone> {
  float zero_;
  float one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val) const {
    const float max_val = -input_val < zero_ ? zero_ : -input_val;
    return (one_ - target_val) * input_val + max_val
           + (logf(expf(-max_val) + expf(-input_val - max_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kPosWeight> {
  float zero_;
  float one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val,
                                              float weight_val) const {
    const float max_val = -input_val < zero_ ? zero_ : -input_val;
    const float pos_weight_processed_val = weight_val - target_val + one_;
    return (one_ - target_val) * input_val
           + (pos_weight_processed_val
              * (logf(expf(-max_val) + expf(-input_val - max_val)) + max_val));
  }
};
// kWeight: plain loss scaled by the elementwise weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    return f(input_val, target_val) * weight_val;
  }
};
// kBoth: pos_weight loss scaled by the elementwise weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val,
                                          T pos_weight_val) const {
    return f(input_val, target_val, pos_weight_val) * weight_val;
  }
};
// half specializations compute in float and convert at the boundaries.
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kNone> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kNone> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val) const {
    return __float2half(f(__half2float(input_val), __half2float(target_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kPosWeight> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kPosWeight> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return __float2half(
        f(__half2float(input_val), __half2float(target_val), __half2float(weight_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kWeight> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return __float2half(
        f(__half2float(input_val), __half2float(target_val), __half2float(weight_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kBoth> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val, half weight_val,
                                             half pos_weight_val) const {
    return __float2half(f(__half2float(input_val), __half2float(target_val),
                          __half2float(weight_val), __half2float(pos_weight_val)));
  }
};
// Sigmoid via the tanh identity: sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5.
// Generic variant uses the double tanh overload (intended for T = double).
template<typename T>
__device__ __forceinline__ T CalSigmoid(const T x) {
  const T half_of_one = static_cast<T>(0.5);
  return half_of_one * tanh(half_of_one * x) + half_of_one;
}
// float: single-precision tanhf avoids double promotion.
template<>
__device__ __forceinline__ float CalSigmoid(const float x) {
  const float half_of_one = static_cast<float>(0.5);
  return half_of_one * tanhf(half_of_one * x) + half_of_one;
}
// half: compute in float, convert at the boundaries.
template<>
__device__ __forceinline__ half CalSigmoid(const half x) {
  return __float2half(CalSigmoid(__half2float(x)));
}
// Elementwise backward functors for BCE-with-logits, specialized per WeightType.
template<typename T, WeightType WEIGHT_TYPE>
struct BinaryCrossEntropyWithLogitsGradFunctor;
// d/dx [(1-t)x + log(1+exp(-x))] = sigmoid(x) - t, scaled by upstream dy.
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone> {
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const {
    return (CalSigmoid(input_val) - target_val) * dy_val;
  }
};
// kPosWeight: `weight_val` is pos_weight broadcast-multiplied by target
// (precomputed by the grad kernel into tmp_buffer).
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight> {
  T one_;
  BinaryCrossEntropyWithLogitsGradFunctor() : one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const {
    return dy_val * ((weight_val + one_ - target_val) * CalSigmoid(input_val) - weight_val);
  }
};
// kWeight: plain gradient scaled by the elementwise weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const {
    return f(input_val, target_val, dy_val) * weight_val;
  }
};
// kBoth: pos_weight gradient scaled by the elementwise weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val,
                                          T pos_weight_val) const {
    return f(input_val, target_val, dy_val, pos_weight_val) * weight_val;
  }
};
// GPU forward kernel for "binary_cross_entropy_with_logits". Dispatches one of four
// elementwise launches keyed on the "has_pos_weight" attribute and the optional
// "weight" input. When pos_weight is used, tmp_buffer (sized by GenFwInferTmpSizeFn)
// holds pos_weight broadcast-multiplied by target before the elementwise launch.
template<typename T>
class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsKernel() = default;
  ~BinaryCrossEntropyWithLogitsKernel() override = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    if (ctx->Attr<bool>("has_pos_weight")) {
      // pos_weight is broadcast along the last axis of target and pre-multiplied
      // with target into tmp_buffer; the functors consume that product.
      T* pos_weight_processed = tmp_buffer_blob->mut_dptr<T>();
      const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr<T>();
      Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes());
      pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1,
                           ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt());
      NdarrayUtil<DeviceType::kCUDA, T>::BroadcastMul(
          ctx->stream(), XpuVarNdarray<T>(target_blob->shape_view(), pos_weight_processed),
          XpuVarNdarray<const T>(pos_weight_shape, pos_weight),
          XpuVarNdarray<const T>(target_blob->shape_view(), target));
      if (ctx->has_input("weight", 0)) {
        // kBoth: 4-input launch (input, target, weight, pos_weight * target).
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kBoth>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, out, input, target, weight, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight>(), elem_cnt, out, input,
            target, pos_weight_processed, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    } else {
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kWeight>(), elem_cnt, out, input,
            target, weight, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Binary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone>(), elem_cnt, out, input,
            target, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// GPU backward kernel for "binary_cross_entropy_with_logits". Mirrors the forward
// kernel's dispatch: four launch variants keyed on "has_pos_weight" and the optional
// "weight" input; when pos_weight is present, tmp_buffer (sized by
// GenBwInferTmpSizeFn) holds pos_weight broadcast-multiplied by target.
template<typename T>
class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsGradKernel() = default;
  // Fix: marked `override` for consistency with the forward kernel's destructor
  // (silences -Winconsistent-missing-override; base destructor is virtual).
  ~BinaryCrossEntropyWithLogitsGradKernel() override = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    if (ctx->Attr<bool>("has_pos_weight")) {
      // pos_weight is broadcast along the last axis of target and pre-multiplied
      // with target into tmp_buffer; the grad functors consume that product.
      T* pos_weight_processed = tmp_buffer_blob->mut_dptr<T>();
      const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr<T>();
      Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes());
      pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1,
                           ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt());
      NdarrayUtil<DeviceType::kCUDA, T>::BroadcastMul(
          ctx->stream(), XpuVarNdarray<T>(target_blob->shape_view(), pos_weight_processed),
          XpuVarNdarray<const T>(pos_weight_shape, pos_weight),
          XpuVarNdarray<const T>(target_blob->shape_view(), target));
      if (ctx->has_input("weight", 0)) {
        // kBoth: 5-input launch (input, target, dy, weight, pos_weight * target).
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kBoth>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    } else {
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kWeight>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone>(), elem_cnt, dx, input,
            target, dy, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Builds the forward tmp-buffer size function: scratch space is needed only when
// "has_pos_weight" is set, in which case the broadcast pos_weight * target product
// occupies one aligned buffer with the same element count as "input".
template<typename T>
user_op::InferTmpSizeFn GenFwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) -> size_t {
    if (!ctx->Attr<bool>("has_pos_weight")) { return 0; }
    const int64_t elem_cnt = ctx->InputShape("input", 0).elem_cnt();
    return GetCudaAlignedSize(elem_cnt * sizeof(T));
  };
}
// Builds the backward tmp-buffer size function; identical policy to the forward
// variant but sized from the "target" input's element count.
template<typename T>
user_op::InferTmpSizeFn GenBwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) -> size_t {
    if (!ctx->Attr<bool>("has_pos_weight")) { return 0; }
    const int64_t elem_cnt = ctx->InputShape("target", 0).elem_cnt();
    return GetCudaAlignedSize(elem_cnt * sizeof(T));
  };
}
} // namespace
// Registers the forward "binary_cross_entropy_with_logits" kernel for one element
// type; the tmp-size fn reserves scratch for the broadcast pos_weight product.
#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype)                                        \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits")                                 \
      .SetCreateFn<BinaryCrossEntropyWithLogitsKernel<dtype>>()                            \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))   \
      .SetInferTmpSizeFn(GenFwInferTmpSizeFn<dtype>());
// Registers the matching backward kernel; also constrains "dy" and "dx" dtypes.
#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype)                                   \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_grad")                            \
      .SetCreateFn<BinaryCrossEntropyWithLogitsGradKernel<dtype>>()                        \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)     \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value))    \
      .SetInferTmpSizeFn(GenBwInferTmpSizeFn<dtype>());
// Instantiate the registrations for the three supported element types.
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double)
} // namespace user_op
} // namespace oneflow
\ No newline at end of file
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include <hipcub/hipcub.hpp>
#include "oneflow/core/kernel/cuda_graph_support.h"
namespace oneflow {
namespace user_op {
namespace {
// Thread-block size for the fused forward reduce-mean kernel below.
constexpr int32_t kBlockSize = 1024;
// Thread-block size for the second-pass ReduceLocalSumKernel.
constexpr int32_t kReduceLocalSumBlockSize = 1024;
// NOTE(review): presumably the element-count cutoff for a single-block launch;
// its use is outside this view — confirm against the launch code.
constexpr int32_t kSingleBlockProcessNumThreshold = 1024;
// Maps an element type to the type used for accumulation: identity by default,
// but half accumulates in float to preserve precision.
template<typename T>
struct DefaultComputeType {
  using type = T;
};
template<>
struct DefaultComputeType<half> {
  using type = float;
};
// Picks a grid size for `func`: enough blocks to cover `waves` waves of the GPU's
// SMs at the occupancy-limited residency, clamped to [1, max_blocks].
// Returns the first failing HIP runtime error, leaving *num_blocks untouched on failure.
template<class Func>
inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size,
                               int64_t max_blocks, int64_t waves, int* num_blocks) {
  int dev;
  {
    hipError_t err = hipGetDevice(&dev);
    if (err != hipSuccess) { return err; }
  }
  int sm_count;
  {
    hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev);
    if (err != hipSuccess) { return err; }
  }
  int max_active_blocks;
  {
    hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func,
                                                                  block_size, dynamic_smem_size);
    // Fix: this status was previously assigned but never checked, so a failed
    // occupancy query left max_active_blocks uninitialized and went unreported.
    if (err != hipSuccess) { return err; }
  }
  *num_blocks =
      std::max<int>(1, std::min<int64_t>(max_blocks, sm_count * max_active_blocks * waves));
  return hipSuccess;
}
// Fused forward kernel: each thread grid-stride-accumulates the numerically stable
// BCE-with-logits loss ((1-t)x + max(-x,0) + log(exp(-max)+exp(-x-max))) over
// `local_elem_cnt` elements, the block reduces with hipcub::BlockReduce, and
// thread 0 writes this block's partial mean (block sum / reduce_elem_cnt) to
// out[blockIdx.x]. Partial sums are combined afterwards (see ReduceLocalSumKernel).
// Launch expects blockDim.x == kBlockSize (BlockReduce is sized for it).
template<typename In, typename Out, typename ComputeType>
__global__ void FusedBinaryCrossEntropyWithLogitsReduceMeanKernel(const In* input, const In* target,
                                                                  Out* out,
                                                                  const int32_t local_elem_cnt,
                                                                  const int32_t reduce_elem_cnt) {
  ComputeType zero = static_cast<ComputeType>(0.0);
  ComputeType one = static_cast<ComputeType>(1.0);
  using BlockReduce = hipcub::BlockReduce<ComputeType, kBlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  ComputeType reduce_sum = 0.0;
  CUDA_1D_KERNEL_LOOP(i, local_elem_cnt) {
    const ComputeType input_val = static_cast<ComputeType>(input[i]);
    const ComputeType target_val = static_cast<ComputeType>(target[i]);
    const ComputeType max_val = -input_val < zero ? zero : -input_val;
    const ComputeType result =
        (one - target_val) * input_val + max_val + (log(exp(-max_val) + exp(-input_val - max_val)));
    reduce_sum += result;
  }
  const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum);
  if (threadIdx.x == 0) { out[blockIdx.x] = static_cast<Out>(block_reduce_sum / reduce_elem_cnt); }
}
// Second-pass reduction: sums `elem_cnt` per-block partials from
// block_local_sum_buf into a single scalar out[0]. Intended as a one-block
// launch with blockDim.x == kReduceLocalSumBlockSize (only thread 0 of each
// block writes out[0], so multiple blocks would race on it).
template<typename Out, typename ComputeType>
__global__ void ReduceLocalSumKernel(ComputeType* block_local_sum_buf, Out* out, int64_t elem_cnt) {
  using BlockReduce = hipcub::BlockReduce<ComputeType, kReduceLocalSumBlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  ComputeType reduce_sum = 0.0;
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) { reduce_sum += block_local_sum_buf[i]; }
  const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum);
  if (threadIdx.x == 0) { out[0] = static_cast<Out>(block_reduce_sum); }
}
// Logistic function via the identity sigmoid(x) = 0.5 + 0.5 * tanh(0.5 * x),
// which avoids computing exp(-x) directly.
template<typename T>
__device__ __forceinline__ T Sigmoid(const T x) {
  const T half_factor = static_cast<T>(0.5);
  return half_factor + half_factor * tanh(x * half_factor);
}
// half specialization: widen to float, evaluate, then narrow back to half.
template<>
__device__ __forceinline__ half Sigmoid(const half x) {
  const float x_f32 = __half2float(x);
  return __float2half(Sigmoid(x_f32));
}
// Elementwise backward functor:
//   dx = (sigmoid(input) - target) * dy * (1 / elem_cnt)
// where dy is the (scalar) upstream gradient captured by value.
template<typename T, typename ComputeType>
struct BinaryCrossEntropyWithLogitsReduceMeanGradFunctor {
  OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradFunctor(
      const T elem_cnt_reciprocal, const T dy)
      // Fix: initializers reordered to match member declaration order
      // (dy, then elem_cnt_reciprocal); members are always initialized in
      // declaration order, so the old list triggered -Wreorder.
      : dy(dy), elem_cnt_reciprocal(elem_cnt_reciprocal) {}
  __device__ T operator()(const T input_val, const T target_val) const {
    return (Sigmoid(input_val) - target_val) * dy * elem_cnt_reciprocal;
  }
  const T dy;
  const T elem_cnt_reciprocal;
};
// Factory functor for cuda::elementwise::BinaryWithFactory: holds 1/elem_cnt
// and a device pointer to dy; on device, operator() builds the per-element
// grad functor by dereferencing dy_ptr (so dy need not be copied to host).
template<typename T, typename ComputeType>
struct BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor {
  OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor(
      const int32_t elem_cnt, const T* dy_ptr)
      // Fix: initializers reordered to match member declaration order
      // (dy_ptr, then elem_cnt_reciprocal), silencing -Wreorder.
      // NOTE(review): the reciprocal is computed in float even for T=double,
      // which costs ~half the mantissa; confirm whether that is acceptable.
      : dy_ptr(dy_ptr), elem_cnt_reciprocal(1.0f / elem_cnt) {}
  __device__ BinaryCrossEntropyWithLogitsReduceMeanGradFunctor<T, ComputeType> operator()() const {
    return BinaryCrossEntropyWithLogitsReduceMeanGradFunctor<T, ComputeType>(elem_cnt_reciprocal,
                                                                             *dy_ptr);
  }
  const T* dy_ptr;
  const T elem_cnt_reciprocal;
};
// Forward kernel: out = sum(bce_with_logits(input, target)) / reduce_elem_cnt.
// Small inputs use a single fused block; larger inputs do a two-stage
// reduction (per-block partials into tmp_buffer, then one summing block).
template<typename T>
class BinaryCrossEntropyWithLogitsMeanKernel final : public user_op::OpKernel,
                                                     public CudaGraphSupport {
 public:
  BinaryCrossEntropyWithLogitsMeanKernel() = default;
  ~BinaryCrossEntropyWithLogitsMeanKernel() override = default;
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
  std::shared_ptr<user_op::OpKernelCache> InitOpKernelCache(
      user_op::KernelCacheContext* ctx) const override {
    return CreateBCEWithLogitsReduceMeanKernelCache(ctx);
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache* cache) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    int64_t local_elem_cnt = input_blob->shape_view().elem_cnt();
    int64_t reduce_elem_cnt = local_elem_cnt;
    if (cache != nullptr) {
      // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor.
      const auto* bce_cache = dynamic_cast<const BCEWithLogitsReduceMeanKernelCache*>(cache);
      CHECK_NOTNULL(bce_cache);
      reduce_elem_cnt = bce_cache->reduce_elem_cnt();
    }
    using ComputeType = typename DefaultComputeType<T>::type;
    if (local_elem_cnt <= kSingleBlockProcessNumThreshold) {
      // One block covers all elements and writes the final mean directly.
      FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, T, ComputeType>
          <<<1, kBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              input_blob->dptr<T>(), target_blob->dptr<T>(), out_blob->mut_dptr<T>(),
              local_elem_cnt, reduce_elem_cnt);
    } else {
      auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
      // Fix: tmp_buffer holds ComputeType partial sums (float when T is half),
      // so its capacity must be counted in ComputeType units. Dividing by
      // sizeof(T) overstated the capacity for half, letting the kernel write
      // past the end of the buffer.
      const int64_t tmp_buffer_elem_cnt =
          tmp_buffer->shape_view().elem_cnt() / sizeof(ComputeType);
      const int64_t block_num = (local_elem_cnt + kBlockSize - 1) / kBlockSize;
      int launch_block = block_num;
      OF_CUDA_CHECK(GetNumBlocks(
          FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, ComputeType, ComputeType>,
          kBlockSize, 0, block_num, 32, &launch_block));
      launch_block = std::min<int32_t>(tmp_buffer_elem_cnt, launch_block);
      FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, ComputeType, ComputeType>
          <<<launch_block, kBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              input_blob->dptr<T>(), target_blob->dptr<T>(), tmp_buffer->mut_dptr<ComputeType>(),
              local_elem_cnt, reduce_elem_cnt);
      // Fix: only `launch_block` partials were written (the fused kernel is
      // grid-stride), so sum exactly that many; the old `block_num` count read
      // uninitialized memory beyond the buffer whenever occupancy clamped the
      // grid below block_num.
      ReduceLocalSumKernel<T, ComputeType>
          <<<1, kReduceLocalSumBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              tmp_buffer->mut_dptr<ComputeType>(), out_blob->mut_dptr<T>(), launch_block);
    }
  }
};
// Backward kernel: dx = (sigmoid(input) - target) * (*dy) / reduce_elem_cnt,
// computed in a single fused elementwise pass; dy stays on the device and is
// read through the factory functor.
template<typename T>
class BinaryCrossEntropyWithLogitsReduceMeanGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default;
  ~BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default;
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
  std::shared_ptr<user_op::OpKernelCache> InitOpKernelCache(
      user_op::KernelCacheContext* ctx) const override {
    return CreateBCEWithLogitsReduceMeanKernelCache(ctx);
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache* cache) const override {
    const auto* input_tensor = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_tensor = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
    const int64_t local_elem_cnt = input_tensor->shape_view().elem_cnt();
    // Because `out`'s SBP maybe P or B, the mean factor comes from the cache
    // (global element count) when one is present.
    int64_t reduce_elem_cnt = local_elem_cnt;
    if (cache != nullptr) {
      const auto* bce_cache = dynamic_cast<const BCEWithLogitsReduceMeanKernelCache*>(cache);
      CHECK_NOTNULL(bce_cache);
      reduce_elem_cnt = bce_cache->reduce_elem_cnt();
    }
    using ComputeType = typename DefaultComputeType<T>::type;
    // NOTE(review): the functor narrows elem_cnt to int32_t — confirm inputs
    // never exceed INT32_MAX elements.
    OF_CUDA_CHECK((cuda::elementwise::BinaryWithFactory(
        BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor<T, ComputeType>(
            reduce_elem_cnt, dy_tensor->dptr<T>()),
        local_elem_cnt, dx_tensor->mut_dptr<T>(), input_tensor->dptr<T>(),
        target_tensor->dptr<T>(), ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
  }
};
} // namespace
// Registers the forward kernel. The tmp buffer holds one ComputeType partial
// sum per launched block, so it is sized in ComputeType units. (Fix: it was
// previously sized with sizeof(dtype), which under-allocates for half since
// the partials are float.)
#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(dtype)                                 \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean")                          \
      .SetCreateFn<BinaryCrossEntropyWithLogitsMeanKernel<dtype>>()                             \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                          \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)       \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value)      \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))        \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                       \
        const int64_t elem_cnt = ctx->InputShape("input", 0).elem_cnt();                        \
        const int64_t block_num = (elem_cnt + kBlockSize - 1) / kBlockSize;                     \
        int launch_block = block_num;                                                           \
        using ComputeType = typename DefaultComputeType<dtype>::type;                           \
        OF_CUDA_CHECK(GetNumBlocks(                                                             \
            FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<dtype, ComputeType, ComputeType>, \
            kBlockSize, 0, block_num, 32, &launch_block));                                      \
        const int64_t tmp_buffer_size = GetCudaAlignedSize(launch_block * sizeof(ComputeType)); \
        return tmp_buffer_size;                                                                 \
      });
// Registers the backward kernel; no tmp buffer is needed because the gradient
// is a pure elementwise pass.
#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(dtype)                 \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean_grad")          \
      .SetCreateFn<BinaryCrossEntropyWithLogitsReduceMeanGradKernel<dtype>>()        \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)               \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)     \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
// Instantiate both ops for the supported floating-point dtypes.
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(double)
} // namespace user_op
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include <hipcub/hipcub.hpp>
#include "oneflow/core/kernel/cuda_graph_support.h"
namespace oneflow {
namespace user_op {
namespace {
constexpr int32_t kBlockSize = 1024;
constexpr int32_t kReduceLocalSumBlockSize = 1024;
constexpr int32_t kSingleBlockProcessNumThreshold = 1024;
template<typename T>
struct DefaultComputeType {
using type = T;
};
template<>
struct DefaultComputeType<half> {
using type = float;
};
// Chooses a grid size for `func`: enough blocks to fill every multiprocessor
// `waves` times over (based on occupancy), clamped to [1, max_blocks].
// Returns the first HIP error encountered; *num_blocks is only valid on
// hipSuccess.
template<class Func>
inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size,
                               int64_t max_blocks, int64_t waves, int* num_blocks) {
  int dev;
  {
    hipError_t err = hipGetDevice(&dev);
    if (err != hipSuccess) { return err; }
  }
  int sm_count;
  {
    hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev);
    if (err != hipSuccess) { return err; }
  }
  int max_active_blocks;
  {
    hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func,
                                                                  block_size, dynamic_smem_size);
    // Fix: this error was previously ignored, leaving max_active_blocks
    // uninitialized on failure and feeding garbage into the grid-size math.
    if (err != hipSuccess) { return err; }
  }
  *num_blocks =
      std::max<int>(1, std::min<int64_t>(max_blocks, sm_count * max_active_blocks * waves));
  return hipSuccess;
}
template<typename In, typename Out, typename ComputeType>
__global__ void FusedBinaryCrossEntropyWithLogitsReduceMeanKernel(const In* input, const In* target,
Out* out,
const int32_t local_elem_cnt,
const int32_t reduce_elem_cnt) {
ComputeType zero = static_cast<ComputeType>(0.0);
ComputeType one = static_cast<ComputeType>(1.0);
using BlockReduce = hipcub::BlockReduce<ComputeType, kBlockSize>;
__shared__ typename BlockReduce::TempStorage temp_storage;
ComputeType reduce_sum = 0.0;
CUDA_1D_KERNEL_LOOP(i, local_elem_cnt) {
const ComputeType input_val = static_cast<ComputeType>(input[i]);
const ComputeType target_val = static_cast<ComputeType>(target[i]);
const ComputeType max_val = -input_val < zero ? zero : -input_val;
const ComputeType result =
(one - target_val) * input_val + max_val + (log(exp(-max_val) + exp(-input_val - max_val)));
reduce_sum += result;
}
const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum);
if (threadIdx.x == 0) { out[blockIdx.x] = static_cast<Out>(block_reduce_sum / reduce_elem_cnt); }
}
template<typename Out, typename ComputeType>
__global__ void ReduceLocalSumKernel(ComputeType* block_local_sum_buf, Out* out, int64_t elem_cnt) {
using BlockReduce = hipcub::BlockReduce<ComputeType, kReduceLocalSumBlockSize>;
__shared__ typename BlockReduce::TempStorage temp_storage;
ComputeType reduce_sum = 0.0;
CUDA_1D_KERNEL_LOOP(i, elem_cnt) { reduce_sum += block_local_sum_buf[i]; }
const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum);
if (threadIdx.x == 0) { out[0] = static_cast<Out>(block_reduce_sum); }
}
template<typename T>
__device__ __forceinline__ T Sigmoid(const T x) {
const T half_of_one = static_cast<T>(0.5);
return half_of_one * tanh(half_of_one * x) + half_of_one;
}
template<>
__device__ __forceinline__ half Sigmoid(const half x) {
return __float2half(Sigmoid(__half2float(x)));
}
// Elementwise backward functor:
//   dx = (sigmoid(input) - target) * dy * (1 / elem_cnt)
// where dy is the (scalar) upstream gradient captured by value.
template<typename T, typename ComputeType>
struct BinaryCrossEntropyWithLogitsReduceMeanGradFunctor {
  OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradFunctor(
      const T elem_cnt_reciprocal, const T dy)
      // Fix: initializers reordered to match member declaration order
      // (dy, then elem_cnt_reciprocal); members are always initialized in
      // declaration order, so the old list triggered -Wreorder.
      : dy(dy), elem_cnt_reciprocal(elem_cnt_reciprocal) {}
  __device__ T operator()(const T input_val, const T target_val) const {
    return (Sigmoid(input_val) - target_val) * dy * elem_cnt_reciprocal;
  }
  const T dy;
  const T elem_cnt_reciprocal;
};
// Factory functor for cuda::elementwise::BinaryWithFactory: holds 1/elem_cnt
// and a device pointer to dy; on device, operator() builds the per-element
// grad functor by dereferencing dy_ptr (so dy need not be copied to host).
template<typename T, typename ComputeType>
struct BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor {
  OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor(
      const int32_t elem_cnt, const T* dy_ptr)
      // Fix: initializers reordered to match member declaration order
      // (dy_ptr, then elem_cnt_reciprocal), silencing -Wreorder.
      // NOTE(review): the reciprocal is computed in float even for T=double,
      // which costs ~half the mantissa; confirm whether that is acceptable.
      : dy_ptr(dy_ptr), elem_cnt_reciprocal(1.0f / elem_cnt) {}
  __device__ BinaryCrossEntropyWithLogitsReduceMeanGradFunctor<T, ComputeType> operator()() const {
    return BinaryCrossEntropyWithLogitsReduceMeanGradFunctor<T, ComputeType>(elem_cnt_reciprocal,
                                                                             *dy_ptr);
  }
  const T* dy_ptr;
  const T elem_cnt_reciprocal;
};
// Forward kernel: out = sum(bce_with_logits(input, target)) / reduce_elem_cnt.
// Small inputs use a single fused block; larger inputs do a two-stage
// reduction (per-block partials into tmp_buffer, then one summing block).
template<typename T>
class BinaryCrossEntropyWithLogitsMeanKernel final : public user_op::OpKernel,
                                                     public CudaGraphSupport {
 public:
  BinaryCrossEntropyWithLogitsMeanKernel() = default;
  ~BinaryCrossEntropyWithLogitsMeanKernel() override = default;
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
  std::shared_ptr<user_op::OpKernelCache> InitOpKernelCache(
      user_op::KernelCacheContext* ctx) const override {
    return CreateBCEWithLogitsReduceMeanKernelCache(ctx);
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache* cache) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    int64_t local_elem_cnt = input_blob->shape_view().elem_cnt();
    int64_t reduce_elem_cnt = local_elem_cnt;
    if (cache != nullptr) {
      // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor.
      const auto* bce_cache = dynamic_cast<const BCEWithLogitsReduceMeanKernelCache*>(cache);
      CHECK_NOTNULL(bce_cache);
      reduce_elem_cnt = bce_cache->reduce_elem_cnt();
    }
    using ComputeType = typename DefaultComputeType<T>::type;
    if (local_elem_cnt <= kSingleBlockProcessNumThreshold) {
      // One block covers all elements and writes the final mean directly.
      FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, T, ComputeType>
          <<<1, kBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              input_blob->dptr<T>(), target_blob->dptr<T>(), out_blob->mut_dptr<T>(),
              local_elem_cnt, reduce_elem_cnt);
    } else {
      auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
      // Fix: tmp_buffer holds ComputeType partial sums (float when T is half),
      // so its capacity must be counted in ComputeType units. Dividing by
      // sizeof(T) overstated the capacity for half, letting the kernel write
      // past the end of the buffer.
      const int64_t tmp_buffer_elem_cnt =
          tmp_buffer->shape_view().elem_cnt() / sizeof(ComputeType);
      const int64_t block_num = (local_elem_cnt + kBlockSize - 1) / kBlockSize;
      int launch_block = block_num;
      OF_CUDA_CHECK(GetNumBlocks(
          FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, ComputeType, ComputeType>,
          kBlockSize, 0, block_num, 32, &launch_block));
      launch_block = std::min<int32_t>(tmp_buffer_elem_cnt, launch_block);
      FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, ComputeType, ComputeType>
          <<<launch_block, kBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              input_blob->dptr<T>(), target_blob->dptr<T>(), tmp_buffer->mut_dptr<ComputeType>(),
              local_elem_cnt, reduce_elem_cnt);
      // Fix: only `launch_block` partials were written (the fused kernel is
      // grid-stride), so sum exactly that many; the old `block_num` count read
      // uninitialized memory beyond the buffer whenever occupancy clamped the
      // grid below block_num.
      ReduceLocalSumKernel<T, ComputeType>
          <<<1, kReduceLocalSumBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              tmp_buffer->mut_dptr<ComputeType>(), out_blob->mut_dptr<T>(), launch_block);
    }
  }
};
template<typename T>
class BinaryCrossEntropyWithLogitsReduceMeanGradKernel final : public user_op::OpKernel {
public:
BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default;
~BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default;
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
std::shared_ptr<user_op::OpKernelCache> InitOpKernelCache(
user_op::KernelCacheContext* ctx) const override {
return CreateBCEWithLogitsReduceMeanKernelCache(ctx);
}
private:
using user_op::OpKernel::Compute;
void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
const user_op::OpKernelCache* cache) const override {
const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
int64_t local_elem_cnt = input_blob->shape_view().elem_cnt();
int64_t reduce_elem_cnt = local_elem_cnt;
if (cache != nullptr) {
// Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor.
const auto* bce_cache = dynamic_cast<const BCEWithLogitsReduceMeanKernelCache*>(cache);
CHECK_NOTNULL(bce_cache);
reduce_elem_cnt = bce_cache->reduce_elem_cnt();
}
const T* dy = dy_blob->dptr<T>();
const T* input = input_blob->dptr<T>();
const T* target = target_blob->dptr<T>();
T* dx = dx_blob->mut_dptr<T>();
using ComputeType = typename DefaultComputeType<T>::type;
OF_CUDA_CHECK((cuda::elementwise::BinaryWithFactory(
BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor<T, ComputeType>(reduce_elem_cnt, dy),
local_elem_cnt, dx, input, target, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
}
};
} // namespace
// Registers the forward kernel. The tmp buffer holds one ComputeType partial
// sum per launched block, so it is sized in ComputeType units. (Fix: it was
// previously sized with sizeof(dtype), which under-allocates for half since
// the partials are float.)
#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(dtype)                                 \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean")                          \
      .SetCreateFn<BinaryCrossEntropyWithLogitsMeanKernel<dtype>>()                             \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                          \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)       \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value)      \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))        \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                       \
        const int64_t elem_cnt = ctx->InputShape("input", 0).elem_cnt();                        \
        const int64_t block_num = (elem_cnt + kBlockSize - 1) / kBlockSize;                     \
        int launch_block = block_num;                                                           \
        using ComputeType = typename DefaultComputeType<dtype>::type;                           \
        OF_CUDA_CHECK(GetNumBlocks(                                                             \
            FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<dtype, ComputeType, ComputeType>, \
            kBlockSize, 0, block_num, 32, &launch_block));                                      \
        const int64_t tmp_buffer_size = GetCudaAlignedSize(launch_block * sizeof(ComputeType)); \
        return tmp_buffer_size;                                                                 \
      });
#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(dtype) \
REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean_grad") \
.SetCreateFn<BinaryCrossEntropyWithLogitsReduceMeanGradKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
&& (user_op::HobDataType("input", 0) == GetDataType<dtype>::value) \
&& (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
&& (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value) \
&& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(double)
} // namespace user_op
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/ndarray/ndarray_util.h"
#include "oneflow/core/ndarray/xpu_var_ndarray.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
// Elementwise out[i] = SafeLog(in[i]) for `len` elements (grid-stride loop).
// NOTE(review): SafeLog is a project helper — presumably a log with a guard
// against non-positive input; verify its contract.
template<typename T>
__global__ void ComputeLogGpu(const int64_t len, T* out, const T* in) {
  CUDA_1D_KERNEL_LOOP(i, len) { out[i] = SafeLog(in[i]); }
}
// float16 specialization: reinterpret to the device `half` type so the half
// overload of SafeLog is used.
template<>
__global__ void ComputeLogGpu<float16>(const int64_t len, float16* out, const float16* in) {
  const half* _in = reinterpret_cast<const half*>(in);
  half* _out = reinterpret_cast<half*>(out);
  CUDA_1D_KERNEL_LOOP(i, len) { _out[i] = SafeLog(_in[i]); }
}
// Gradient of broadcast_pow w.r.t. the exponent y.
// Assuming the forward op computed z = pow(x, y) (TODO confirm against the
// forward kernel), d z / d y = z * log(x), so
//   dy = reduce_sum(dz * z * log(x))  reduced back to y's shape.
// The steps below build dz * z * log(x) in tmp_buffer, then reduce it.
template<DeviceType device, typename T>
class BroadcastPowYGradKernel final : public user_op::OpKernel {
 public:
  BroadcastPowYGradKernel() = default;
  ~BroadcastPowYGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
    const user_op::Tensor* z_tensor = ctx->Tensor4ArgNameAndIndex("z", 0);
    const user_op::Tensor* dz_tensor = ctx->Tensor4ArgNameAndIndex("dz", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
    const int64_t num_axes = dz_tensor->shape_view().NumAxes();
    const int64_t elem_cnt = z_tensor->shape_view().elem_cnt();
    // Zero the scratch buffer so the BroadcastAdd below effectively copies x
    // (broadcast to z's shape) into it: tmp = x + 0.
    Memset<device>(ctx->stream(), tmp_buffer->mut_dptr<T>(), 0,
                   GetCudaAlignedSize(elem_cnt * sizeof(T)));
    XpuVarNdarray<const T> z(z_tensor->shape_view(), z_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<const T> dz(dz_tensor->shape_view(), dz_tensor->dptr<T>(), num_axes);
    // const_tmp and tmp alias the same buffer: each step reads the previous
    // step's result in place.
    XpuVarNdarray<const T> const_tmp(dz.shape(), tmp_buffer->dptr<T>());
    XpuVarNdarray<T> tmp(dz.shape(), tmp_buffer->mut_dptr<T>());
    XpuVarNdarray<const T> x(x_tensor->shape_view(), x_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<T> dy(dy_tensor->shape_view(), dy_tensor->mut_dptr<T>(), num_axes);
    // tmp = broadcast(x) (x + zeroed tmp).
    NdarrayUtil<device, T>::BroadcastAdd(ctx->stream(), tmp, x, const_tmp);
    // tmp = SafeLog(tmp), elementwise.
    ComputeLogGpu<T><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                       ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, tmp_buffer->mut_dptr<T>(), tmp_buffer->dptr<T>());
    // tmp = dz * log(x); then tmp = z * dz * log(x).
    NdarrayUtil<device, T>::BroadcastMul(ctx->stream(), tmp, dz, const_tmp);
    NdarrayUtil<device, T>::BroadcastMul(ctx->stream(), tmp, z, const_tmp);
    // dy = reduce_sum(tmp) over the broadcast axes.
    // NOTE(review): the reduction input and its scratch argument alias the
    // same buffer — presumably ReduceSum permits this; verify.
    NdarrayUtil<device, T>::ReduceSum(ctx->stream(), dy, const_tmp, tmp);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
} // namespace
// Registers broadcast_pow_y_grad; the tmp buffer holds one T per element of z
// (the intermediate dz * z * log(x) before reduction).
#define REGISTER_BROADCAST_POW_Y_GRAD_KERNEL(device, dtype_pair)                      \
  REGISTER_USER_KERNEL("broadcast_pow_y_grad")                                        \
      .SetCreateFn<BroadcastPowYGradKernel<device, OF_PP_PAIR_FIRST(dtype_pair)>>()   \
      .SetIsMatchedHob((user_op::HobDeviceType() == device)                           \
                       && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(dtype_pair))) \
      .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) {                    \
        const user_op::TensorDesc& z = ctx->InputTensorDesc("z", 0);                  \
        const DataType& data_type = z.data_type();                                    \
        const int64_t elem_cnt = z.shape().elem_cnt();                                \
        return GetCudaAlignedSize(elem_cnt * GetSizeOfDataType(data_type));           \
      });
// Instantiate for all arithmetic dtypes plus float16 on CUDA.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_BROADCAST_POW_Y_GRAD_KERNEL, (DeviceType::kCUDA),
                                 ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ)
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/ndarray/ndarray_util.h"
#include "oneflow/core/ndarray/xpu_var_ndarray.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
template<typename T>
__global__ void ComputeLogGpu(const int64_t len, T* out, const T* in) {
CUDA_1D_KERNEL_LOOP(i, len) { out[i] = SafeLog(in[i]); }
}
template<>
__global__ void ComputeLogGpu<float16>(const int64_t len, float16* out, const float16* in) {
const half* _in = reinterpret_cast<const half*>(in);
half* _out = reinterpret_cast<half*>(out);
CUDA_1D_KERNEL_LOOP(i, len) { _out[i] = SafeLog(_in[i]); }
}
template<DeviceType device, typename T>
class BroadcastPowYGradKernel final : public user_op::OpKernel {
public:
BroadcastPowYGradKernel() = default;
~BroadcastPowYGradKernel() = default;
private:
using user_op::OpKernel::Compute;
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
const user_op::Tensor* z_tensor = ctx->Tensor4ArgNameAndIndex("z", 0);
const user_op::Tensor* dz_tensor = ctx->Tensor4ArgNameAndIndex("dz", 0);
user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
const int64_t num_axes = dz_tensor->shape_view().NumAxes();
const int64_t elem_cnt = z_tensor->shape_view().elem_cnt();
Memset<device>(ctx->stream(), tmp_buffer->mut_dptr<T>(), 0,
GetCudaAlignedSize(elem_cnt * sizeof(T)));
XpuVarNdarray<const T> z(z_tensor->shape_view(), z_tensor->dptr<T>(), num_axes);
XpuVarNdarray<const T> dz(dz_tensor->shape_view(), dz_tensor->dptr<T>(), num_axes);
XpuVarNdarray<const T> const_tmp(dz.shape(), tmp_buffer->dptr<T>());
XpuVarNdarray<T> tmp(dz.shape(), tmp_buffer->mut_dptr<T>());
XpuVarNdarray<const T> x(x_tensor->shape_view(), x_tensor->dptr<T>(), num_axes);
XpuVarNdarray<T> dy(dy_tensor->shape_view(), dy_tensor->mut_dptr<T>(), num_axes);
NdarrayUtil<device, T>::BroadcastAdd(ctx->stream(), tmp, x, const_tmp);
ComputeLogGpu<T><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
elem_cnt, tmp_buffer->mut_dptr<T>(), tmp_buffer->dptr<T>());
NdarrayUtil<device, T>::BroadcastMul(ctx->stream(), tmp, dz, const_tmp);
NdarrayUtil<device, T>::BroadcastMul(ctx->stream(), tmp, z, const_tmp);
NdarrayUtil<device, T>::ReduceSum(ctx->stream(), dy, const_tmp, tmp);
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
} // namespace
#define REGISTER_BROADCAST_POW_Y_GRAD_KERNEL(device, dtype_pair) \
REGISTER_USER_KERNEL("broadcast_pow_y_grad") \
.SetCreateFn<BroadcastPowYGradKernel<device, OF_PP_PAIR_FIRST(dtype_pair)>>() \
.SetIsMatchedHob((user_op::HobDeviceType() == device) \
&& (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(dtype_pair))) \
.SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) { \
const user_op::TensorDesc& z = ctx->InputTensorDesc("z", 0); \
const DataType& data_type = z.data_type(); \
const int64_t elem_cnt = z.shape().elem_cnt(); \
return GetCudaAlignedSize(elem_cnt * GetSizeOfDataType(data_type)); \
});
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_BROADCAST_POW_Y_GRAD_KERNEL, (DeviceType::kCUDA),
ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ)
} // namespace oneflow
\ No newline at end of file
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <assert.h>
#include "oneflow/user/kernels/categorical_ordinal_encode_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
// 64-bit device atomics operate on unsigned long long; alias for readability.
using CuInt64T = unsigned long long int;
// Overload set so the templated hash-table code below can call AtomicCAS /
// AtomicAdd uniformly for both int32_t and int64_t element types.
__device__ __inline__ int32_t AtomicCAS(int32_t* address, int32_t compare, int32_t val) {
  return atomicCAS(address, compare, val);
}
// int64_t variant: reinterpret through the unsigned 64-bit type the hardware
// atomic accepts; the static_assert guards the size assumption.
__device__ __inline__ int64_t AtomicCAS(int64_t* address, int64_t compare, int64_t val) {
  static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error");
  return static_cast<int64_t>(atomicCAS(reinterpret_cast<CuInt64T*>(address),
                                        static_cast<CuInt64T>(compare),
                                        static_cast<CuInt64T>(val)));
}
__device__ __inline__ int32_t AtomicAdd(int32_t* address, int32_t val) {
  return atomicAdd(address, val);
}
__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) {
  static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error");
  return static_cast<int64_t>(
      atomicAdd(reinterpret_cast<CuInt64T*>(address), static_cast<CuInt64T>(val)));
}
// Attempts to claim or read the hash-table slot (key, value) for `hash`.
// Returns true when the slot belongs to `hash` — either newly claimed here or
// already owned by another thread — with the ordinal written to *out.
// Returns false when the slot holds a different key (caller probes onward).
// Key 0 marks an empty slot, so `hash` must be non-zero (callers handle 0).
template<typename K, typename V>
__device__ bool TryGetOrInsert(K* key, volatile V* value, V* size, const K hash, V* out) {
  // Try to claim an empty slot atomically (0 -> hash).
  K old_key = AtomicCAS(key, static_cast<K>(0), hash);
  if (old_key == 0) {
    // We claimed it: the new ordinal is the post-increment table size (1-based,
    // since 0 is reserved for "value not yet published").
    V v = AtomicAdd(size, 1) + 1;
    *value = v;
    *out = v;
    return true;
  } else if (old_key == hash) {
    // Another thread owns this key: spin until it publishes a non-zero value.
    // `value` is volatile so each iteration re-reads memory.
    // NOTE(review): no __threadfence between the owner's `*value = v` and this
    // read — confirm the visibility guarantees are sufficient on this target.
    while (true) {
      V v = *value;
      if (v != 0) {
        *out = v;
        break;
      }
    }
    return true;
  } else {
    return false;
  }
}
// Looks up (or inserts) `hash` in an open-addressing table with linear
// probing. The table stores interleaved pairs: table[2*i] = key,
// table[2*i + 1] = value. Hash 0 is reserved as the empty-slot marker and is
// mapped directly to ordinal 0. Returns false only when every slot was probed
// without finding or claiming one (table full).
template<typename T>
__device__ bool GetOrInsertOne(const size_t capacity, T* table, T* size, const T hash, T* out) {
  if (hash == 0) {
    *out = 0;
    return true;
  }
  const size_t start_idx = static_cast<size_t>(hash) % capacity;
  // fast path: the home slot already holds this key with a published value.
  {
    T* key = table + start_idx * 2;
    T* value = key + 1;
    if (*key == hash && *value != 0) {
      *out = *value;
      return true;
    }
  }
  // Slow path: probe linearly (with wraparound) until claimed or table full.
  for (size_t count = 0; count < capacity; ++count) {
    const size_t idx = (start_idx + count) % capacity;
    T* key = table + idx * 2;
    T* value = key + 1;
    if (TryGetOrInsert<T, T>(key, value, size, hash, out)) { return true; }
  }
  return false;
}
// One thread per input element: encodes hash[i] into out[i] via the shared
// device hash table. A failed insert means the table is full; the device-side
// assert fires (NDEBUG is force-undefined at the top of this file, so the
// assert is always live).
template<typename T>
__global__ void EncodeGpu(const size_t capacity, T* table, T* size, const int64_t n, const T* hash,
                          T* out) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    bool success = GetOrInsertOne<T>(capacity, table, size, hash[i], out + i);
    assert(success);
  }
}
} // namespace
// CUDA specialization of the categorical-ordinal-encode primitive: maps each
// of the `n` input hashes to a small stable integer id using the
// caller-provided device hash table (`table`, 2*capacity entries) and running
// counter (`size`).
template<typename T>
struct CategoricalOrdinalEncodeKernelUtil<DeviceType::kCUDA, T> {
  static void Encode(ep::Stream* stream, int64_t capacity, T* table, T* size, int64_t n,
                     const T* hash, T* out) {
    // One thread per element; grid sized by the project-wide helper.
    EncodeGpu<T>
        <<<BlocksNum4ThreadsNum(n), kCudaThreadsNumPerBlock, 0,
           stream->As<ep::CudaStream>()->cuda_stream()>>>(capacity, table, size, n, hash, out);
  }
};
// Instantiate for every supported index type.
#define INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA(type_cpp, type_proto) \
  template struct CategoricalOrdinalEncodeKernelUtil<DeviceType::kCUDA, type_cpp>;
OF_PP_FOR_EACH_TUPLE(INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ);
#undef INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <assert.h>
#include "oneflow/user/kernels/categorical_ordinal_encode_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
using CuInt64T = unsigned long long int;
// 32-bit compare-and-swap wrapper so templated callers can use one AtomicCAS
// name for both index widths; returns the value observed at *address.
__device__ __inline__ int32_t AtomicCAS(int32_t* address, int32_t compare, int32_t val) {
  const int32_t observed = atomicCAS(address, compare, val);
  return observed;
}
// 64-bit compare-and-swap wrapper. HIP/CUDA only expose atomicCAS for
// unsigned long long, so reinterpret through CuInt64T (same size, asserted).
__device__ __inline__ int64_t AtomicCAS(int64_t* address, int64_t compare, int64_t val) {
  static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error");
  CuInt64T* addr_u64 = reinterpret_cast<CuInt64T*>(address);
  const CuInt64T observed =
      atomicCAS(addr_u64, static_cast<CuInt64T>(compare), static_cast<CuInt64T>(val));
  return static_cast<int64_t>(observed);
}
// 32-bit atomic add wrapper; returns the value held before the addition.
__device__ __inline__ int32_t AtomicAdd(int32_t* address, int32_t val) {
  const int32_t old_value = atomicAdd(address, val);
  return old_value;
}
// 64-bit atomic add wrapper. atomicAdd has no signed 64-bit overload, so the
// operand is routed through CuInt64T (identical size, asserted below).
__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) {
  static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error");
  CuInt64T* addr_u64 = reinterpret_cast<CuInt64T*>(address);
  const CuInt64T old_value = atomicAdd(addr_u64, static_cast<CuInt64T>(val));
  return static_cast<int64_t>(old_value);
}
// Tries to claim the slot at `key` for `hash`, or to read the id another thread
// stored there. Returns true when *out has been filled (freshly inserted or
// already present); returns false when the slot belongs to a different hash and
// the caller must probe the next slot.
//   key   - slot key; 0 marks an empty slot (hash == 0 is filtered out by the
//           caller, see GetOrInsertOne)
//   value - slot payload; 0 means "insert in progress, id not yet published"
//   size  - global distinct-key counter; new ids are assigned as old size + 1
template<typename K, typename V>
__device__ bool TryGetOrInsert(K* key, volatile V* value, V* size, const K hash, V* out) {
  // Atomically claim the slot if it is still empty (key == 0).
  K old_key = AtomicCAS(key, static_cast<K>(0), hash);
  if (old_key == 0) {
    // We won the slot: allocate the next ordinal id and publish it via *value.
    V v = AtomicAdd(size, 1) + 1;
    *value = v;
    *out = v;
    return true;
  } else if (old_key == hash) {
    // Slot already owned by this hash; spin until the owner publishes the id.
    // `value` is volatile, so each iteration re-reads device memory.
    while (true) {
      V v = *value;
      if (v != 0) {
        *out = v;
        break;
      }
    }
    return true;
  } else {
    // Slot occupied by a different hash; caller continues linear probing.
    return false;
  }
}
// Looks up (or inserts) `hash` in the open-addressing table and writes its
// ordinal id to *out. Slot i occupies table[2*i] (key) and table[2*i+1]
// (value). A hash of 0 is reserved and always maps to id 0. Returns false only
// when all `capacity` slots are held by other keys (table full).
template<typename T>
__device__ bool GetOrInsertOne(const size_t capacity, T* table, T* size, const T hash, T* out) {
  if (hash == 0) {
    *out = 0;
    return true;
  }
  const size_t start_idx = static_cast<size_t>(hash) % capacity;
  // fast path
  {
    // Plain (non-atomic) peek: if the key is already present with a published
    // value, return without any atomic traffic.
    T* key = table + start_idx * 2;
    T* value = key + 1;
    if (*key == hash && *value != 0) {
      *out = *value;
      return true;
    }
  }
  // Slow path: linear probing from start_idx, wrapping around the table.
  for (size_t count = 0; count < capacity; ++count) {
    const size_t idx = (start_idx + count) % capacity;
    T* key = table + idx * 2;
    T* value = key + 1;
    if (TryGetOrInsert<T, T>(key, value, size, hash, out)) { return true; }
  }
  return false;
}
// Kernel: assigns out[i] the ordinal id of hash[i] by probing the shared device
// hash table. CUDA_1D_KERNEL_LOOP distributes the n items across all launched
// threads, so the kernel is correct for any grid/block configuration.
template<typename T>
__global__ void EncodeGpu(const size_t capacity, T* table, T* size, const int64_t n, const T* hash,
                          T* out) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    const bool inserted = GetOrInsertOne<T>(capacity, table, size, hash[i], out + i);
    // false indicates a full table; NDEBUG is undefined above, so this assert
    // always fires in device code.
    assert(inserted);
  }
}
} // namespace
// Host-side launcher: maps each non-zero hash[i] to a stable ordinal id in out[i].
// `table` holds 2*capacity entries of inline (key, value) pairs and `size` is the
// running count of distinct keys; both reside in device memory.
template<typename T>
struct CategoricalOrdinalEncodeKernelUtil<DeviceType::kCUDA, T> {
  static void Encode(ep::Stream* stream, int64_t capacity, T* table, T* size, int64_t n,
                     const T* hash, T* out) {
    // Empty batch: launching a zero-block grid is an invalid configuration,
    // so bail out early (same guard as the other wrappers in this file).
    if (n == 0) { return; }
    EncodeGpu<T>
        <<<BlocksNum4ThreadsNum(n), kCudaThreadsNumPerBlock, 0,
           stream->As<ep::CudaStream>()->cuda_stream()>>>(capacity, table, size, n, hash, out);
  }
};
// Explicitly instantiate the kernel util for every index data type in
// INDEX_DATA_TYPE_SEQ, then retire the helper macro.
#define INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA(type_cpp, type_proto) \
  template struct CategoricalOrdinalEncodeKernelUtil<DeviceType::kCUDA, type_cpp>;
OF_PP_FOR_EACH_TUPLE(INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ);
#undef INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA
} // namespace oneflow
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment