"vscode:/vscode.git/clone" did not exist on "6880673115182c7e26f96257ff81f5d7985d18ce"
Commit f262efc9 authored by yuguo

Support profiler for DCU, support debug compiler

parent 3f56062c
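
The diff below switches the profiler's CUDA-only code paths to also cover ROCm/HIP so the Kineto-based profiler works on DCU, and adds -mcmodel=large to the ROCm debug build flags. For context, a minimal sketch of how the profiler is driven from Python, modeled on the test hunk at the end of this diff; the oneflow.profiler.profile keyword arguments and the Conv2d setup are assumptions for illustration, not part of this commit:

import oneflow as flow
import oneflow.profiler

# On a DCU/ROCm build of this fork the "cuda" device is assumed to map to HIP.
conv = flow.nn.Conv2d(3, 6, 5).to("cuda")
x = flow.randn(2, 3, 32, 32, device="cuda", requires_grad=True)

# record_shapes feeds the shape_getter path changed in this diff (ShapeView -> Shape);
# the keyword name here is an assumption based on the ProfileManager flags.
with flow.profiler.profile(record_shapes=True) as prof:
    with flow.profiler.record_function("conv_forward_total_time"):
        y = conv(x)
    y.sum().backward()

print(prof.key_averages(group_by_input_shape=True))
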
......@@ -265,9 +265,9 @@ set(ROBIN_HOOD_HASHING_URL
use_mirror(VARIABLE ROBIN_HOOD_HASHING_URL URL ${ROBIN_HOOD_HASHING_URL})
set(ROBIN_HOOD_HASHING_MD5 a78bd30a7582f25984f8592652836467)
set(FMT_URL https://github.com/fmtlib/fmt/archive/48b7e3dafb27ece02cd6addc8bd1041c79d59c2c.zip)
set(FMT_URL https://github.com/fmtlib/fmt/archive/fc07217d85e6dcec52878807d6bbd89a9d9156a5.zip)
use_mirror(VARIABLE FMT_URL URL ${FMT_URL})
set(FMT_MD5 45925a979ed7195e0c88a70be691de09)
set(FMT_MD5 7d9bb2ececc9ede29cd35bdc42a7e22c)
set(KINETO_URL
https://github.com/pytorch/kineto/archive/ff8dba20499a660650632952be76450bd70a52a6.zip)
......
......@@ -175,6 +175,8 @@ if (BUILD_ROCM)
add_definitions(-D__HIP_PLATFORM_HCC__)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mcmodel=large")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -mcmodel=large")
list(APPEND oneflow_third_party_libs hip::device)
list(APPEND oneflow_third_party_libs roc::hipblas)
list(APPEND oneflow_third_party_libs hip::hipcub)
......
......@@ -14,8 +14,8 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
// #include "fmt/core.h"
// #include "fmt/format.h"
#include "fmt/core.h"
#include "fmt/format.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/util.h"
......@@ -58,14 +58,13 @@ std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, Custom
return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
}
// std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
std::string KernelEvent::Key() { return "yuguo"; }
std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
nlohmann::json KernelEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kOneflowKernel;
j["input_shapes"] = GetFormatedInputShapes();
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
j["memory_size"] = memory_size_;
if (!children_.empty()) { j["children"] = children_; }
#endif // WITH_CUDA
......@@ -73,12 +72,10 @@ nlohmann::json KernelEvent::ToJson() {
}
std::shared_ptr<KernelEvent> KernelEvent::Create(
const std::string& name, const std::function<std::vector<ShapeView>(void)>& shape_getter) {
const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter) {
return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
}
void KernelEvent::RecordShape(const ShapeView& shape) { input_shapes_.emplace_back(shape); }
std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
if (input_shapes_.size() == 0) { return "-"; }
std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
......@@ -87,8 +84,7 @@ std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
}
if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
// return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
return "yuguo";
return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
}
} // namespace profiler
......
......@@ -138,11 +138,9 @@ class KernelEvent final : public IEvent {
nlohmann::json ToJson() override;
static std::shared_ptr<KernelEvent> Create(
const std::string& name, const std::function<std::vector<ShapeView>(void)>& shape_getter);
const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter);
void RecordShape(const ShapeView& shape);
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
......@@ -160,17 +158,17 @@ class KernelEvent final : public IEvent {
private:
KernelEvent(const std::string& kernel_name,
const std::function<std::vector<ShapeView>(void)>& shape_getter)
const std::function<std::vector<Shape>(void)>& shape_getter)
: IEvent(kernel_name, EventTimeUnit::kNS) {
if (shape_getter) { input_shapes_ = shape_getter(); }
}
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
int64_t memory_size_ = -1;
std::set<std::shared_ptr<IEvent>> children_;
#endif // WITH_CUDA
std::vector<ShapeView> input_shapes_;
std::vector<Shape> input_shapes_;
std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
};
......
......@@ -32,13 +32,13 @@ std::shared_ptr<EventRecorder> EventRecorder::CreateCustomEventRecorder(const st
Maybe<EventRecorder> EventRecorder::CreateKernelEventRecorder(
const std::string& name,
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
const std::function<int64_t()>& memory_size_getter,
#endif
const ShapeGetterFuncType& shape_getter) {
auto pmgr = Singleton<ProfileManager>::Get();
if (pmgr) {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
if (pmgr->use_cpu_ || pmgr->use_cuda_) {
auto event = KernelEvent::Create(name, pmgr->record_shapes_ ? shape_getter : nullptr);
if (pmgr->use_cuda_) {
......
......@@ -24,7 +24,7 @@ namespace profiler {
class EventRecorder {
public:
using ShapeGetterFuncType = std::function<std::vector<ShapeView>(void)>;
using ShapeGetterFuncType = std::function<std::vector<Shape>(void)>;
OF_DISALLOW_COPY_AND_MOVE(EventRecorder);
......@@ -45,7 +45,7 @@ class EventRecorder {
static Maybe<EventRecorder> CreateKernelEventRecorder(
const std::string& name,
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
const std::function<int64_t()>& memory_size_getter,
#endif
const ShapeGetterFuncType& shape_getter);
......
......@@ -17,7 +17,11 @@ limitations under the License.
#include "oneflow/core/profiler/kernel.h"
#include "oneflow/core/profiler/profiler.h"
#include "oneflow/core/kernel/kernel.h"
#ifdef WITH_ROCM
#include "oneflow/core/ep/rocm/cuda_stream.h"
#else
#include "oneflow/core/ep/cuda/cuda_stream.h"
#endif
#include "oneflow/core/lazy/actor/actor_context.h"
namespace oneflow {
......@@ -43,6 +47,11 @@ thread_local cudaEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
thread_local cudaEvent_t cuda_memory_bandwidth_profile_end_event = nullptr;
#endif // WITH_CUDA
#if defined(WITH_ROCM)
thread_local hipEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
thread_local hipEvent_t cuda_memory_bandwidth_profile_end_event = nullptr;
#endif // WITH_ROCM
} // namespace
void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel* kernel) {
......@@ -61,6 +70,22 @@ void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel*
}
if (profile_kernel_forward_range) { OF_PROFILER_RANGE_PUSH(kernel->op_conf().name()); }
#endif // WITH_CUDA
#if defined(WITH_ROCM)
if (profile_cuda_memory_bandwidth) {
auto* actor_context_provider = dynamic_cast<ActorContextProvider*>(kernel_ctx);
auto* cuda_stream = dynamic_cast<ep::CudaStream*>(kernel_ctx->stream());
if (cuda_stream != nullptr && actor_context_provider != nullptr) {
CHECK(cuda_memory_bandwidth_profile_start_event == nullptr);
CHECK(cuda_memory_bandwidth_profile_end_event == nullptr);
OF_CUDA_CHECK(hipEventCreate(&cuda_memory_bandwidth_profile_start_event));
OF_CUDA_CHECK(hipEventCreate(&cuda_memory_bandwidth_profile_end_event));
OF_CUDA_CHECK(
hipEventRecord(cuda_memory_bandwidth_profile_start_event, cuda_stream->cuda_stream()));
}
}
if (profile_kernel_forward_range) { OF_PROFILER_RANGE_PUSH(kernel->op_conf().name()); }
#endif // WITH_ROCM
}
void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* kernel) {
......@@ -103,6 +128,45 @@ void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* k
}
}
#endif // WITH_CUDA
#if defined(WITH_ROCM)
if (profile_kernel_forward_range) { OF_PROFILER_RANGE_POP(); }
// The memory bandwidth profiler only works in lazy mode.
if (profile_cuda_memory_bandwidth) {
auto* cuda_stream = dynamic_cast<ep::CudaStream*>(kernel_ctx->stream());
auto* actor_context_provider = dynamic_cast<ActorContextProvider*>(kernel_ctx);
if (cuda_stream != nullptr && actor_context_provider != nullptr) {
hipEvent_t start_event = cuda_memory_bandwidth_profile_start_event;
hipEvent_t end_event = cuda_memory_bandwidth_profile_end_event;
cuda_memory_bandwidth_profile_start_event = nullptr;
cuda_memory_bandwidth_profile_end_event = nullptr;
CHECK_NOTNULL(start_event);
CHECK_NOTNULL(end_event);
OF_CUDA_CHECK(hipEventRecord(end_event, cuda_stream->cuda_stream()));
int64_t memory_size = 0;
for (const auto& bn : kernel->op_attribute().input_bns()) {
const Blob* blob = kernel_ctx->BnInOp2Blob(bn);
if (blob) { memory_size += blob->ByteSizeOfBlobBody(); }
}
for (const auto& bn : kernel->op_attribute().output_bns()) {
const Blob* blob = kernel_ctx->BnInOp2Blob(bn);
if (blob) { memory_size += blob->ByteSizeOfBlobBody(); }
}
const std::string op_name = kernel->op_conf().name();
actor_context_provider->GetActorContext()->AddCallback(
[start_event, end_event, memory_size, op_name]() {
float elapsed_ms = 0;
OF_CUDA_CHECK(hipEventElapsedTime(&elapsed_ms, start_event, end_event));
OF_CUDA_CHECK(hipEventDestroy(start_event));
OF_CUDA_CHECK(hipEventDestroy(end_event));
double bandwidth =
static_cast<double>(memory_size) / (1024.0 * 1024.0 * 1024.0) / (elapsed_ms / 1000);
LOG(INFO) << "PROFILER::KERNEL::CUDA_MEMORY_BANDWIDTH op_name: " << op_name
<< " elapsed(ms): " << elapsed_ms << " memory_size(Byte): " << memory_size
<< " bandwidth(GB/s): " << bandwidth;
});
}
}
#endif // WITH_ROCM
}
} // namespace profiler
......
......@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#include "oneflow/core/profiler/kineto_shim.h"
#include "libkineto.h"
......
......@@ -16,7 +16,7 @@ limitations under the License.
#ifndef ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
#define ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#include <string>
#include <memory>
......
......@@ -15,12 +15,12 @@ limitations under the License.
*/
#include <memory>
#include <unordered_map>
// #include "fmt/core.h"
#include "fmt/core.h"
#include "nlohmann/json.hpp"
#include "oneflow/core/profiler/kineto_shim.h"
#include "oneflow/core/profiler/profile_manager.h"
#include "oneflow/core/profiler/event.h"
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#include <libkineto.h>
#endif // WITH_CUDA
......@@ -48,7 +48,7 @@ std::string ProfileManager::DumpResultsJson() {
}
std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
auto trace = StopTrace();
const auto& kineto_events = *(trace.get()->activities());
std::set<std::shared_ptr<IEvent>> custom_events;
......@@ -77,7 +77,7 @@ std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
while (!events_.empty()) {
auto evt = events_.front();
events_.pop();
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
auto evt_kernel = std::dynamic_pointer_cast<KernelEvent>(evt);
if (evt_kernel) {
std::set<int64_t> current_corr_ids;
......@@ -106,8 +106,7 @@ std::string ProfileManager::GetNextEventRecorderKey(const std::string& name) {
} else {
event_recorders_last_id_[name]++;
}
// return fmt::format("{}.{}", name, event_recorders_last_id_[name]);
return "yuguo";
return fmt::format("{}.{}", name, event_recorders_last_id_[name]);
}
} // namespace profiler
......
......@@ -37,7 +37,7 @@ class ProfileManager {
use_cuda_(use_cuda),
record_shapes_(record_shapes),
record_bandwidth_(record_bandwidth) {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
std::set<ActivityType> activities{};
if (use_cpu) { activities.insert(ActivityType::CPU); }
if (use_cuda) { activities.insert(ActivityType::CUDA); }
......
......@@ -20,11 +20,20 @@ limitations under the License.
#include "oneflow/core/profiler/event_recorder.h"
#include "oneflow/core/vm/vm_util.h"
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_profile.h>
#include <roctracer_roctx.h>
#include <sys/syscall.h>
#include <iostream>
#include "oneflow/core/device/cuda_util.h"
#else
#include <nvtx3/nvToolsExt.h>
#include <sys/syscall.h>
#include <iostream>
#include <cuda_profiler_api.h>
#include "oneflow/core/device/cuda_util.h"
#endif
#endif // OF_ENABLE_PROFILER
namespace oneflow {
......@@ -33,6 +42,16 @@ namespace profiler {
void NameThisHostThread(const std::string& name) {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
static thread_local std::unique_ptr<std::string> thread_name_prefix;
if (!thread_name_prefix) {
thread_name_prefix.reset(
new std::string(GetStringFromEnv("ONEFLOW_PROFILER_HOST_THREAD_NAME_PREFIX", "")));
}
const std::string name_with_prefix = *thread_name_prefix + name;
// nvtxNameOsThreadA(syscall(SYS_gettid), name_with_prefix.c_str());
roctxMarkA(name_with_prefix.c_str());
#else
static thread_local std::unique_ptr<std::string> thread_name_prefix;
if (!thread_name_prefix) {
thread_name_prefix.reset(
......@@ -40,18 +59,27 @@ void NameThisHostThread(const std::string& name) {
}
const std::string name_with_prefix = *thread_name_prefix + name;
nvtxNameOsThreadA(syscall(SYS_gettid), name_with_prefix.c_str());
#endif
#endif // OF_ENABLE_PROFILER
}
void RangePush(const std::string& name) {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
roctxRangePushA(name.c_str());
#else
nvtxRangePushA(name.c_str());
#endif
#endif // OF_ENABLE_PROFILER
}
void RangePop() {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
roctxRangePop();
#else
nvtxRangePop();
#endif
#endif // OF_ENABLE_PROFILER
}
......@@ -82,13 +110,21 @@ void LogHostMemoryUsage(const std::string& name) {
void ProfilerStart() {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
OF_CUDA_CHECK(hipProfilerStart());
#else
OF_CUDA_CHECK(cudaProfilerStart());
#endif
#endif // OF_ENABLE_PROFILER
}
void ProfilerStop() {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
OF_CUDA_CHECK(hipProfilerStop());
#else
OF_CUDA_CHECK(cudaProfilerStop());
#endif
#endif // OF_ENABLE_PROFILER
}
......@@ -105,6 +141,9 @@ Maybe<std::string> DisableProfilerAndReturnResult() {
#if defined(WITH_CUDA)
OF_CUDA_CHECK(cudaDeviceSynchronize());
#endif // WITH_CUDA
#if defined(WITH_ROCM)
OF_CUDA_CHECK(hipDeviceSynchronize());
#endif // WITH_ROCM
auto* pmgr = JUST(SingletonMaybe<ProfileManager>());
std::string results = pmgr->DumpResultsJson();
Singleton<ProfileManager>::Delete();
......
......@@ -250,7 +250,7 @@ struct LgammaFunctor<float> {
static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
// TODO(chengcheng): return: dy * digamma(x)
assert(false);
// assert(false);
return 0.0f;
}
};
......@@ -526,7 +526,7 @@ struct LgammaFunctor<double> {
static OF_DEVICE_FUNC double Backward(const double x, const double dy) {
// TODO(chengcheng): return: dy * digamma(x)
assert(false);
// assert(false);
return 0.0;
}
};
......@@ -817,7 +817,7 @@ struct LgammaFunctor<half> {
static OF_HALF_FUNC half Backward(const half x, const half dy) {
// TODO(chengcheng): return: dy * digamma(x)
assert(false);
// assert(false);
return GetZeroVal<half>();
}
};
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#ifdef OF_ENABLE_PROFILER
#include <roctracer_roctx.h>
#endif // OF_ENABLE_PROFILER
namespace oneflow {
namespace {
#ifdef OF_ENABLE_PROFILER
static thread_local HashMap<std::string, roctx_range_id_t> mark2range_id;
#endif
} // namespace
class NvtxOpKernelState final : public user_op::OpKernelState {
public:
NvtxOpKernelState() : counter_(0) {
#ifndef OF_ENABLE_PROFILER
LOG(WARNING) << "To use NVTX, run cmake with -DBUILD_PROFILER=ON";
#endif
}
~NvtxOpKernelState() override = default;
int64_t counter() const { return counter_; }
void IncreaseCount() { counter_ += 1; }
private:
int64_t counter_;
};
class NvtxStartKernel final : public user_op::OpKernel {
public:
NvtxStartKernel() = default;
~NvtxStartKernel() override = default;
std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
user_op::KernelInitContext* ctx) const override {
return std::make_shared<NvtxOpKernelState>();
}
private:
using user_op::OpKernel::Compute;
void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
const user_op::OpKernelCache*) const override {
const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
const ShapeView& in_shape = in->shape_view();
CHECK_EQ(out->shape_view(), in_shape);
const DataType in_data_type = in->data_type();
CHECK_EQ(out->data_type(), in_data_type);
Memcpy<DeviceType::kCUDA>(ctx->stream(), out->mut_dptr<void>(), in->dptr<void>(),
in_shape.elem_cnt() * GetSizeOfDataType(in_data_type));
#ifdef OF_ENABLE_PROFILER
auto* kernel_state = dynamic_cast<NvtxOpKernelState*>(state);
const std::string mark_prefix = ctx->Attr<std::string>("mark_prefix");
const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter());
roctx_range_id_t range_id = roctxRangeStartA(mark.c_str());
CHECK(mark2range_id.emplace(mark, range_id).second);
kernel_state->IncreaseCount();
#endif // OF_ENABLE_PROFILER
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
REGISTER_USER_KERNEL("nvtx_start")
.SetCreateFn<NvtxStartKernel>()
.SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA)
.SetInplaceProposalFn([](const user_op::InferContext&,
user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> {
OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));
return Maybe<void>::Ok();
});
class NvtxEndKernel final : public user_op::OpKernel {
public:
NvtxEndKernel() = default;
~NvtxEndKernel() override = default;
std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
user_op::KernelInitContext* ctx) const override {
return std::make_shared<NvtxOpKernelState>();
}
private:
using user_op::OpKernel::Compute;
void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
const user_op::OpKernelCache*) const override {
const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
const ShapeView& in_shape = in->shape_view();
CHECK_EQ(out->shape_view(), in_shape);
const DataType in_data_type = in->data_type();
CHECK_EQ(out->data_type(), in_data_type);
#ifdef OF_ENABLE_PROFILER
auto* kernel_state = dynamic_cast<NvtxOpKernelState*>(state);
const std::string mark_prefix = ctx->Attr<std::string>("mark_prefix");
const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter());
auto it = mark2range_id.find(mark.c_str());
CHECK(it != mark2range_id.end());
roctx_range_id_t range_id = it->second;
mark2range_id.erase(it);
roctxRangeStop(range_id);
Memcpy<DeviceType::kCUDA>(ctx->stream(), out->mut_dptr<void>(), in->dptr<void>(),
in_shape.elem_cnt() * GetSizeOfDataType(in_data_type));
kernel_state->IncreaseCount();
#endif
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
REGISTER_USER_KERNEL("nvtx_end")
.SetCreateFn<NvtxEndKernel>()
.SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA)
.SetInplaceProposalFn([](const user_op::InferContext&,
user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> {
OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));
return Maybe<void>::Ok();
});
} // namespace oneflow
......@@ -867,7 +867,7 @@ void StatefulOpKernel::Compute(eager::CallContext* call_ctx, DeviceCtx* device_c
auto* compute_ctx = &compute_context;
OF_PROFILER_RANGE_GUARD("Compute");
if (Singleton<profiler::ProfileManager>::Get()) {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
const auto CalMemorySize = [compute_ctx](const one::ArgVec& args) -> int64_t {
const auto Func = [compute_ctx](int64_t mem_size, const auto& pair) {
const auto tensor = compute_ctx->Tensor4ArgNameAndIndex(pair.first, pair.second);
......@@ -878,13 +878,13 @@ void StatefulOpKernel::Compute(eager::CallContext* call_ctx, DeviceCtx* device_c
#endif
auto er_guard = CHECK_JUST(profiler::EventRecorder::CreateKernelEventRecorder(
op_type_name(),
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
[compute_ctx, CalMemorySize]() -> int64_t {
return CalMemorySize(compute_ctx->inputs()) + CalMemorySize(compute_ctx->outputs());
},
#endif
[compute_ctx]() -> std::vector<ShapeView> {
std::vector<ShapeView> shapes;
[compute_ctx]() -> std::vector<Shape> {
std::vector<Shape> shapes;
for (const auto& pair : compute_ctx->inputs()) {
shapes.emplace_back(
compute_ctx->TensorDesc4ArgNameAndIndex(pair.first, pair.second)->shape());
......
import numpy as np
import oneflow as flow
def fused_dot_feature_interaction(x,
                                  y,
                                  self_interaction=False,
                                  output_padding=0,
                                  output_concat=None,
                                  dtype=flow.float32
                                  ):
    # (bs, es) = x.shape
    (bs, dims, es) = y.shape
    if self_interaction:
        offset = 1
    else:
        offset = 0
    li = flow.tensor([i for i in range(dims + 1) for j in range(i + offset)])
    lj = flow.tensor([j for i in range(dims + 1) for j in range(i + offset)])
    T = flow.cat(
        [
            flow.reshape(x, (bs, 1, es)),
            y,
        ],
        dim=1,
    )
    Z = flow.matmul(T, T, transpose_b=True)
    # gather_nd does not support half, so cast to float32
    Z = flow.cast(Z, flow.float32)
    Zflat = Z[:, li, lj]
    Zflat = flow.cast(Zflat, dtype)
    if output_concat is not None:
        R = flow.cat([output_concat, Zflat], dim=1)
    else:
        R = Zflat
    if output_padding != 0:
        padding_tensor = flow.tensor(
            np.zeros((bs, output_padding)).astype(np.float32),
            device="cuda",
            requires_grad=False,
        )
        R = flow.cat([R, padding_tensor], dim=1)
    return R
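
A small usage sketch for the helper above; the shapes are arbitrary and chosen only for illustration (CPU tensors and the default output_padding=0, so the hard-coded cuda padding tensor is never created):

bs, dims, es = 4, 26, 16
x = flow.randn(bs, es)
y = flow.randn(bs, dims, es)
# With self_interaction=False, the result is the flattened strict lower triangle of the
# (dims + 1) x (dims + 1) dot-product matrix: (dims + 1) * dims / 2 = 351 columns.
out = fused_dot_feature_interaction(x, y)
print(out.shape)  # expected: (4, 351)
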
......@@ -80,7 +80,7 @@ def _test_lenet(
with oneflow.profiler.record_function("lenet_backward_total_time") as f:
    eager_res.sum().backward()
events = prof.key_averages(group_by_input_shape=True)
print(events)
conv_event = get_event(
    events, "conv2d", "[(2,3,32,32), (6,3,5,5)]" if record_shapes else "-"
)
......