Support profiler for DCU, support debug compiler

f262efc9 · yuguo · 3f56062c · f262efc9 · f262efc9 · f262efc9
Commit f262efc9 authored Nov 21, 2022 by yuguo
17 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -265,9 +265,9 @@ set(ROBIN_HOOD_HASHING_URL
 use_mirror(VARIABLE ROBIN_HOOD_HASHING_URL URL ${ROBIN_HOOD_HASHING_URL})
 set(ROBIN_HOOD_HASHING_MD5 a78bd30a7582f25984f8592652836467)
-set(FMT_URL https://github.com/fmtlib/fmt/archive/48b7e3dafb27ece02cd6addc8bd1041c79d59c2c.zip)
+set(FMT_URL https://github.com/fmtlib/fmt/archive/fc07217d85e6dcec52878807d6bbd89a9d9156a5.zip)
 use_mirror(VARIABLE FMT_URL URL ${FMT_URL})
-set(FMT_MD5 45925a979ed7195e0c88a70be691de09)
+set(FMT_MD5 7d9bb2ececc9ede29cd35bdc42a7e22c)
 set(KINETO_URL
    https://github.com/pytorch/kineto/archive/ff8dba20499a660650632952be76450bd70a52a6.zip)

--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -175,6 +175,8 @@ if (BUILD_ROCM)
  add_definitions(-D__HIP_PLATFORM_HCC__)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
+  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mcmodel=large")
+  set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -mcmodel=large")
  list(APPEND oneflow_third_party_libs hip::device)
  list(APPEND oneflow_third_party_libs roc::hipblas)
  list(APPEND oneflow_third_party_libs hip::hipcub)

--- a/oneflow/core/profiler/event.cpp
+++ b/oneflow/core/profiler/event.cpp
 /*
 Copyright 2020 The OneFlow Authors. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-// #include "fmt/core.h"
+#include "fmt/core.h"
-// #include "fmt/format.h"
+#include "fmt/format.h"
 #include "oneflow/core/profiler/event.h"
 #include "oneflow/core/profiler/util.h"
 using json = nlohmann::json;
 namespace oneflow {
 namespace profiler {
 nlohmann::json IEvent::ToJson() {
  return json{{"name", name_}, {"time", GetDuration<double>()}, {"input_shapes", "-"}};
 }
 void IEvent::SetStartedAt(double t) { started_at_ = t; }
 void IEvent::SetFinishedAt(double t) { finished_at_ = t; }
 void IEvent::Start() { SetStartedAt(GetTimeNow()); }
 void IEvent::Finish() { SetFinishedAt(GetTimeNow()); }
 bool IEvent::IsChildOf(const IEvent* e) {
  if (!e) { return false; }
  if (this == e) { return false; }
  return GetStartedAt<double>() >= e->GetStartedAt<double>()
         && GetFinishedAt<double>() <= e->GetFinishedAt<double>();
 }
 const std::string& IEvent::GetName() const { return name_; }
 std::string CustomEvent::Key() { return name_; }
 nlohmann::json CustomEvent::ToJson() {
  auto j = IEvent::ToJson();
  j["type"] = EventType::kCustom;
  j["custom_type"] = type_;
  return j;
 }
 std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, CustomEventType type) {
  return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
 }
-// std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
+std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
-std::string KernelEvent::Key() { return "yuguo"; }
+nlohmann::json KernelEvent::ToJson() {
-nlohmann::json KernelEvent::ToJson() {
+  auto j = IEvent::ToJson();
-  auto j = IEvent::ToJson();
+  j["type"] = EventType::kOneflowKernel;
-  j["type"] = EventType::kOneflowKernel;
+  j["input_shapes"] = GetFormatedInputShapes();
-  j["input_shapes"] = GetFormatedInputShapes();
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
-#if defined(WITH_CUDA)
+  j["memory_size"] = memory_size_;
-  j["memory_size"] = memory_size_;
+  if (!children_.empty()) { j["children"] = children_; }
-  if (!children_.empty()) { j["children"] = children_; }
+#endif  // WITH_CUDA
-#endif  // WITH_CUDA
+  return j;
-  return j;
+}
-}
+std::shared_ptr<KernelEvent> KernelEvent::Create(
-std::shared_ptr<KernelEvent> KernelEvent::Create(
+    const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter) {
-    const std::string& name, const std::function<std::vector<ShapeView>(void)>& shape_getter) {
+  return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
-  return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
+}
-}
+std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
-void KernelEvent::RecordShape(const ShapeView& shape) { input_shapes_.emplace_back(shape); }
+  if (input_shapes_.size() == 0) { return "-"; }
+  std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
-std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
+  for (auto i = 0; i < shapes_formated.size(); ++i) {
-  if (input_shapes_.size() == 0) { return "-"; }
+    const std::string current_shape = input_shapes_[i].ToString();
-  std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
+    shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
-  for (auto i = 0; i < shapes_formated.size(); ++i) {
+  }
-    const std::string current_shape = input_shapes_[i].ToString();
+  if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
-    shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
+  return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
-  }
+}
-  if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
-  // return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
+}  // namespace profiler
-  return "yuguo";
-}
-}  // namespace profiler
 }  // namespace oneflow
\ No newline at end of file
--- a/oneflow/core/profiler/event.h
+++ b/oneflow/core/profiler/event.h
 /*
 Copyright 2020 The OneFlow Authors. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #ifndef ONEFLOW_CORE_PROFILER_EVENT_H_
 #define ONEFLOW_CORE_PROFILER_EVENT_H_
 #include <functional>
 #include <memory>
 #include <vector>
 #include "nlohmann/json.hpp"
 #include "oneflow/core/common/util.h"
 #include "oneflow/core/common/shape_view.h"
 namespace oneflow {
 namespace profiler {
 class ProfileManager;
 enum class EventType {
  kCustom,        // has three kinds
  kOneflowKernel  // OneFlow cpu/cuda kernel
 };
 enum class CustomEventType {
  kDefault,     // for record_function
  kCudaKernel,  // cuda kernel
  kCudaRuntime  // something like cudaLaunchKernel
 };
 enum class EventTimeUnit { kNS, kUS };
 class IEvent {
 public:
  OF_DISALLOW_COPY_AND_MOVE(IEvent);
  IEvent() = delete;
  IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {}
  virtual std::string Key() = 0;
  virtual nlohmann::json ToJson();
  virtual ~IEvent() = default;
  virtual void Start();
  virtual void Finish();
  bool IsChildOf(const IEvent* e);
  const std::string& GetName() const;
  template<typename T>
  const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
  template<typename T>
  const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
  template<typename T>
  const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
 protected:
  virtual void SetStartedAt(double t);
  virtual void SetFinishedAt(double t);
  std::string name_;
  EventTimeUnit time_unit_;
  double started_at_ = 0;
  double finished_at_ = 0;
 };
 inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) {
  if (src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS) {
    return time_ / 1000;
  }
  if (src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS) {
    return time_ * 1000;
  }
  return time_;
 }
 template<>
 const inline double IEvent::GetStartedAt<double>(EventTimeUnit time_unit) const {
  return ConvertTime(started_at_, time_unit_, time_unit);
 }
 template<>
 const inline time_t IEvent::GetStartedAt<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetStartedAt<double>(time_unit));
 }
 template<>
 const inline double IEvent::GetFinishedAt<double>(EventTimeUnit time_unit) const {
  return ConvertTime(finished_at_, time_unit_, time_unit);
 }
 template<>
 const inline time_t IEvent::GetFinishedAt<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetFinishedAt<double>(time_unit));
 }
 template<>
 const inline double IEvent::GetDuration<double>(EventTimeUnit time_unit) const {
  return GetFinishedAt<double>(time_unit) - GetStartedAt<double>(time_unit);
 }
 template<>
 const inline time_t IEvent::GetDuration<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetDuration<double>(time_unit));
 }
 class CustomEvent final : public IEvent {
 public:
  friend class ProfileManager;
  std::string Key() override;
  nlohmann::json ToJson() override;
  static std::shared_ptr<CustomEvent> Create(const std::string& name,
                                             CustomEventType type = CustomEventType::kDefault);
 private:
  CustomEventType type_;
  CustomEvent(const std::string& custom_name, CustomEventType type)
      : IEvent(custom_name,
               type == CustomEventType::kDefault ? EventTimeUnit::kNS : EventTimeUnit::kUS),
        type_(type) {}
 };
 class KernelEvent final : public IEvent {
 public:
  std::string Key() override;
  nlohmann::json ToJson() override;
  static std::shared_ptr<KernelEvent> Create(
-      const std::string& name, const std::function<std::vector<ShapeView>(void)>& shape_getter);
+      const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter);
-  void RecordShape(const ShapeView& shape);
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
+  void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
-#if defined(WITH_CUDA)
+  void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
-  void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
+  bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
-  void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
+    if (e->IsChildOf(dynamic_cast<IEvent*>(this))) {
-  bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
+      children_.emplace(e);
-    if (e->IsChildOf(dynamic_cast<IEvent*>(this))) {
+      return true;
-      children_.emplace(e);
+    }
-      return true;
+    return false;
-    }
+  }
-    return false;
+  bool HasChildEvent(const std::shared_ptr<IEvent>& e) { return children_.count(e); }
-  }
+  void WalkAmongChildren(const std::function<void(const std::shared_ptr<IEvent>& e)>& f) const {
-  bool HasChildEvent(const std::shared_ptr<IEvent>& e) { return children_.count(e); }
+    for (const auto& x : children_) { f(x); }
-  void WalkAmongChildren(const std::function<void(const std::shared_ptr<IEvent>& e)>& f) const {
+  }
-    for (const auto& x : children_) { f(x); }
+#endif  // WITH_CUDA
-  }
-#endif  // WITH_CUDA
+ private:
+  KernelEvent(const std::string& kernel_name,
- private:
+              const std::function<std::vector<Shape>(void)>& shape_getter)
-  KernelEvent(const std::string& kernel_name,
+      : IEvent(kernel_name, EventTimeUnit::kNS) {
-              const std::function<std::vector<ShapeView>(void)>& shape_getter)
+    if (shape_getter) { input_shapes_ = shape_getter(); }
-      : IEvent(kernel_name, EventTimeUnit::kNS) {
+  }
-    if (shape_getter) { input_shapes_ = shape_getter(); }
-  }
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
+  int64_t memory_size_ = -1;
-#if defined(WITH_CUDA)
+  std::set<std::shared_ptr<IEvent>> children_;
-  int64_t memory_size_ = -1;
+#endif  // WITH_CUDA
-  std::set<std::shared_ptr<IEvent>> children_;
-#endif  // WITH_CUDA
+  std::vector<Shape> input_shapes_;
+  std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
-  std::vector<ShapeView> input_shapes_;
+};
-  std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
-};
+}  // namespace profiler
+}  // namespace oneflow
-}  // namespace profiler
-}  // namespace oneflow
+namespace nlohmann {
-namespace nlohmann {
+inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) {
+  j = event->ToJson();
-inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) {
+}
-  j = event->ToJson();
-}
+}  // namespace nlohmann
-}  // namespace nlohmann
+#endif  // ONEFLOW_CORE_PROFILER_EVENT_H_
-#endif  // ONEFLOW_CORE_PROFILER_EVENT_H_
--- a/oneflow/core/profiler/event_recorder.cpp
+++ b/oneflow/core/profiler/event_recorder.cpp
@@ -32,13 +32,13 @@ std::shared_ptr<EventRecorder> EventRecorder::CreateCustomEventRecorder(const st
 Maybe<EventRecorder> EventRecorder::CreateKernelEventRecorder(
    const std::string& name,
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
    const std::function<int64_t()>& memory_size_getter,
 #endif
    const ShapeGetterFuncType& shape_getter) {
  auto pmgr = Singleton<ProfileManager>::Get();
  if (pmgr) {
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
    if (pmgr->use_cpu_ || pmgr->use_cuda_) {
      auto event = KernelEvent::Create(name, pmgr->record_shapes_ ? shape_getter : nullptr);
      if (pmgr->use_cuda_) {

--- a/oneflow/core/profiler/event_recorder.h
+++ b/oneflow/core/profiler/event_recorder.h
 /*
 Copyright 2020 The OneFlow Authors. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
 #define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
 #include "oneflow/core/common/util.h"
 #include "oneflow/core/profiler/event.h"
 namespace oneflow {
 namespace profiler {
 class EventRecorder {
 public:
-  using ShapeGetterFuncType = std::function<std::vector<ShapeView>(void)>;
+  using ShapeGetterFuncType = std::function<std::vector<Shape>(void)>;
  OF_DISALLOW_COPY_AND_MOVE(EventRecorder);
  explicit EventRecorder(const std::shared_ptr<IEvent>& event) : event_(event) {
    CHECK_JUST(RegisterEventToProfileManager(event));
    event_->Start();
  }
  Maybe<void> RegisterEventToProfileManager(const std::shared_ptr<IEvent>& event);
  ~EventRecorder() {
    if (event_) {
      event_->Finish();
      event_.reset();
    }
  }
  static std::shared_ptr<EventRecorder> CreateCustomEventRecorder(const std::string& name);
  static Maybe<EventRecorder> CreateKernelEventRecorder(
      const std::string& name,
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
      const std::function<int64_t()>& memory_size_getter,
 #endif
      const ShapeGetterFuncType& shape_getter);
 private:
  std::shared_ptr<IEvent> event_;
 };
 }  // namespace profiler
 }  // namespace oneflow
 #endif  // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
--- a/oneflow/core/profiler/kernel.cpp
+++ b/oneflow/core/profiler/kernel.cpp
@@ -17,7 +17,11 @@ limitations under the License.
 #include "oneflow/core/profiler/kernel.h"
 #include "oneflow/core/profiler/profiler.h"
 #include "oneflow/core/kernel/kernel.h"
+#ifdef WITH_ROCM
+#include "oneflow/core/ep/rocm/cuda_stream.h"
+#else
 #include "oneflow/core/ep/cuda/cuda_stream.h"
+#endif
 #include "oneflow/core/lazy/actor/actor_context.h"
 namespace oneflow {
@@ -43,6 +47,11 @@ thread_local cudaEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
 thread_local cudaEvent_t cuda_memory_bandwidth_profile_end_event = nullptr;
 #endif  // WITH_CUDA
+#if defined(WITH_ROCM)
+thread_local hipEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
+thread_local hipEvent_t cuda_memory_bandwidth_profile_end_event = nullptr;
+#endif  // WITH_ROCM
 }  // namespace
 void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel* kernel) {
@@ -61,6 +70,22 @@ void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel*
  }
  if (profile_kernel_forward_range) { OF_PROFILER_RANGE_PUSH(kernel->op_conf().name()); }
 #endif  // WITH_CUDA
+#if defined(WITH_ROCM)
+  if (profile_cuda_memory_bandwidth) {
+    auto* actor_context_provider = dynamic_cast<ActorContextProvider*>(kernel_ctx);
+    auto* cuda_stream = dynamic_cast<ep::CudaStream*>(kernel_ctx->stream());
+    if (cuda_stream != nullptr && actor_context_provider != nullptr) {
+      CHECK(cuda_memory_bandwidth_profile_start_event == nullptr);
+      CHECK(cuda_memory_bandwidth_profile_end_event == nullptr);
+      OF_CUDA_CHECK(hipEventCreate(&cuda_memory_bandwidth_profile_start_event));
+      OF_CUDA_CHECK(hipEventCreate(&cuda_memory_bandwidth_profile_end_event));
+      OF_CUDA_CHECK(
+          hipEventRecord(cuda_memory_bandwidth_profile_start_event, cuda_stream->cuda_stream()));
+    }
+  }
+  if (profile_kernel_forward_range) { OF_PROFILER_RANGE_PUSH(kernel->op_conf().name()); }
+#endif  // WITH_ROCM
 }
 void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* kernel) {
@@ -103,6 +128,45 @@ void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* k
    }
  }
 #endif  // WITH_CUDA
+#if defined(WITH_ROCM)
+  if (profile_kernel_forward_range) { OF_PROFILER_RANGE_POP(); }
+  // The memory bandwidth profiler only works in lazy mode.
+  if (profile_cuda_memory_bandwidth) {
+    auto* cuda_stream = dynamic_cast<ep::CudaStream*>(kernel_ctx->stream());
+    auto* actor_context_provider = dynamic_cast<ActorContextProvider*>(kernel_ctx);
+    if (cuda_stream != nullptr && actor_context_provider != nullptr) {
+      hipEvent_t start_event = cuda_memory_bandwidth_profile_start_event;
+      hipEvent_t end_event = cuda_memory_bandwidth_profile_end_event;
+      cuda_memory_bandwidth_profile_start_event = nullptr;
+      cuda_memory_bandwidth_profile_end_event = nullptr;
+      CHECK_NOTNULL(start_event);
+      CHECK_NOTNULL(end_event);
+      OF_CUDA_CHECK(hipEventRecord(end_event, cuda_stream->cuda_stream()));
+      int64_t memory_size = 0;
+      for (const auto& bn : kernel->op_attribute().input_bns()) {
+        const Blob* blob = kernel_ctx->BnInOp2Blob(bn);
+        if (blob) { memory_size += blob->ByteSizeOfBlobBody(); }
+      }
+      for (const auto& bn : kernel->op_attribute().output_bns()) {
+        const Blob* blob = kernel_ctx->BnInOp2Blob(bn);
+        if (blob) { memory_size += blob->ByteSizeOfBlobBody(); }
+      }
+      const std::string op_name = kernel->op_conf().name();
+      actor_context_provider->GetActorContext()->AddCallback(
+          [start_event, end_event, memory_size, op_name]() {
+            float elapsed_ms = 0;
+            OF_CUDA_CHECK(hipEventElapsedTime(&elapsed_ms, start_event, end_event));
+            OF_CUDA_CHECK(hipEventDestroy(start_event));
+            OF_CUDA_CHECK(hipEventDestroy(end_event));
+            double bandwidth =
+                static_cast<double>(memory_size) / (1024.0 * 1024.0 * 1024.0) / (elapsed_ms / 1000);
+            LOG(INFO) << "PROFILER::KERNEL::CUDA_MEMORY_BANDWIDTH op_name: " << op_name
+                      << " elapsed(ms): " << elapsed_ms << " memory_size(Byte): " << memory_size
+                      << " bandwidth(GB/s): " << bandwidth;
+          });
+    }
+  }
+#endif  // WITH_ROCM
 }
 }  // namespace profiler

--- a/oneflow/core/profiler/kineto_shim.cpp
+++ b/oneflow/core/profiler/kineto_shim.cpp
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
 #include "oneflow/core/profiler/kineto_shim.h"
 #include "libkineto.h"

--- a/oneflow/core/profiler/kineto_shim.h
+++ b/oneflow/core/profiler/kineto_shim.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
 #define ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
 #include <string>
 #include <memory>

--- a/oneflow/core/profiler/profile_manager.cpp
+++ b/oneflow/core/profiler/profile_manager.cpp
@@ -15,12 +15,12 @@ limitations under the License.
 */
 #include <memory>
 #include <unordered_map>
-// #include "fmt/core.h"
+#include "fmt/core.h"
 #include "nlohmann/json.hpp"
 #include "oneflow/core/profiler/kineto_shim.h"
 #include "oneflow/core/profiler/profile_manager.h"
 #include "oneflow/core/profiler/event.h"
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
 #include <libkineto.h>
 #endif  // WITH_CUDA
@@ -48,7 +48,7 @@ std::string ProfileManager::DumpResultsJson() {
 }
 std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
  auto trace = StopTrace();
  const auto& kineto_events = *(trace.get()->activities());
  std::set<std::shared_ptr<IEvent>> custom_events;
@@ -77,7 +77,7 @@ std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
  while (!events_.empty()) {
    auto evt = events_.front();
    events_.pop();
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
    auto evt_kernel = std::dynamic_pointer_cast<KernelEvent>(evt);
    if (evt_kernel) {
      std::set<int64_t> current_corr_ids;
@@ -106,8 +106,7 @@ std::string ProfileManager::GetNextEventRecorderKey(const std::string& name) {
  } else {
    event_recorders_last_id_[name]++;
  }
-  // return fmt::format("{}.{}", name, event_recorders_last_id_[name]);
+  return fmt::format("{}.{}", name, event_recorders_last_id_[name]);
-  return "yuguo";
 }
 }  // namespace profiler

--- a/oneflow/core/profiler/profile_manager.h
+++ b/oneflow/core/profiler/profile_manager.h
@@ -37,7 +37,7 @@ class ProfileManager {
        use_cuda_(use_cuda),
        record_shapes_(record_shapes),
        record_bandwidth_(record_bandwidth) {
-#if defined(WITH_CUDA)
+#if defined(WITH_CUDA) || defined(WITH_ROCM)
    std::set<ActivityType> activities{};
    if (use_cpu) { activities.insert(ActivityType::CPU); }
    if (use_cuda) { activities.insert(ActivityType::CUDA); }

--- a/oneflow/core/profiler/profiler.cpp
+++ b/oneflow/core/profiler/profiler.cpp
@@ -20,11 +20,20 @@ limitations under the License.
 #include "oneflow/core/profiler/event_recorder.h"
 #include "oneflow/core/vm/vm_util.h"
 #ifdef OF_ENABLE_PROFILER
+#ifdef WITH_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_profile.h>
+#include <roctracer_roctx.h>
+#include <sys/syscall.h>
+#include <iostream>
+#include "oneflow/core/device/cuda_util.h"
+#else
 #include <nvtx3/nvToolsExt.h>
 #include <sys/syscall.h>
 #include <iostream>
 #include <cuda_profiler_api.h>
 #include "oneflow/core/device/cuda_util.h"
+#endif
 #endif  // OF_ENABLE_PROFILER
 namespace oneflow {
@@ -33,6 +42,16 @@ namespace profiler {
 void NameThisHostThread(const std::string& name) {
 #ifdef OF_ENABLE_PROFILER
+#ifdef WITH_ROCM
+  static thread_local std::unique_ptr<std::string> thread_name_prefix;
+  if (!thread_name_prefix) {
+    thread_name_prefix.reset(
+        new std::string(GetStringFromEnv("ONEFLOW_PROFILER_HOST_THREAD_NAME_PREFIX", "")));
+  }
+  const std::string name_with_prefix = *thread_name_prefix + name;
+  // nvtxNameOsThreadA(syscall(SYS_gettid), name_with_prefix.c_str());
+  roctxMarkA(name_with_prefix.c_str());
+#else
  static thread_local std::unique_ptr<std::string> thread_name_prefix;
  if (!thread_name_prefix) {
    thread_name_prefix.reset(
@@ -40,18 +59,27 @@ void NameThisHostThread(const std::string& name) {
  }
  const std::string name_with_prefix = *thread_name_prefix + name;
  nvtxNameOsThreadA(syscall(SYS_gettid), name_with_prefix.c_str());
+#endif
 #endif  // OF_ENABLE_PROFILER
 }
 void RangePush(const std::string& name) {
 #ifdef OF_ENABLE_PROFILER
+#ifdef WITH_ROCM
+  roctxRangePushA(name.c_str());
+#else
  nvtxRangePushA(name.c_str());
+#endif
 #endif  // OF_ENABLE_PROFILER
 }
 void RangePop() {
 #ifdef OF_ENABLE_PROFILER
+#ifdef WITH_ROCM
+  roctxRangePop();
+#else
  nvtxRangePop();
+#endif
 #endif  // OF_ENABLE_PROFILER
 }
@@ -82,13 +110,21 @@ void LogHostMemoryUsage(const std::string& name) {
 void ProfilerStart() {
 #ifdef OF_ENABLE_PROFILER
+#ifdef WITH_ROCM
+  OF_CUDA_CHECK(hipProfilerStart());
+#else
  OF_CUDA_CHECK(cudaProfilerStart());
+#endif
 #endif  // OF_ENABLE_PROFILER
 }
 void ProfilerStop() {
 #ifdef OF_ENABLE_PROFILER
+#ifdef WITH_ROCM
+  OF_CUDA_CHECK(hipProfilerStop());
+#else
  OF_CUDA_CHECK(cudaProfilerStop());
+#endif
 #endif  // OF_ENABLE_PROFILER
 }
@@ -105,6 +141,9 @@ Maybe<std::string> DisableProfilerAndReturnResult() {
 #if defined(WITH_CUDA)
  OF_CUDA_CHECK(cudaDeviceSynchronize());
 #endif  // WITH_CUDA
+#if defined(WITH_ROCM)
+  OF_CUDA_CHECK(hipDeviceSynchronize());
+#endif  // WITH_ROCM
  auto* pmgr = JUST(SingletonMaybe<ProfileManager>());
  std::string results = pmgr->DumpResultsJson();
  Singleton<ProfileManager>::Delete();

--- a/oneflow/user/kernels/math_unary_elementwise_func.h
+++ b/oneflow/user/kernels/math_unary_elementwise_func.h
--- a/oneflow/user/kernels/nvtx_range_kernel.hip.cpp
+++ b/oneflow/user/kernels/nvtx_range_kernel.hip.cpp
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/kernel/new_kernel_util.h"
+#ifdef OF_ENABLE_PROFILER
+#include <roctracer_roctx.h>
+#endif  // OF_ENABLE_PROFILER
+namespace oneflow {
+namespace {
+#ifdef OF_ENABLE_PROFILER
+// Currently open roctx ranges, keyed by mark string ("<mark_prefix>-<counter>").
+// NvtxStartKernel::Compute inserts an entry and NvtxEndKernel::Compute looks it up and erases it.
+// The map is thread_local, so a range can only be found on the thread that opened it.
+static thread_local HashMap<std::string, roctx_range_id_t> mark2range_id;
+#endif
+}  // namespace
+// Kernel state shared by the nvtx_start / nvtx_end kernels: a counter that starts at 0 and is
+// advanced by one through IncreaseCount().
+class NvtxOpKernelState final : public user_op::OpKernelState {
+ public:
+  NvtxOpKernelState() {
+#if !defined(OF_ENABLE_PROFILER)
+    // Without the profiler the range calls are compiled out, so tell the user how to enable them.
+    LOG(WARNING) << "To use NVTX, run cmake with -DBUILD_PROFILER=ON";
+#endif
+  }
+  ~NvtxOpKernelState() override = default;
+  // Current value of the counter.
+  int64_t counter() const { return counter_; }
+  // Advances the counter by one.
+  void IncreaseCount() { ++counter_; }
+ private:
+  int64_t counter_ = 0;
+};
+// Copies "in" to "out" and, in OF_ENABLE_PROFILER builds, opens a roctx range named
+// "<mark_prefix>-<counter>" and records its id in mark2range_id.
+class NvtxStartKernel final : public user_op::OpKernel {
+ public:
+  NvtxStartKernel() = default;
+  ~NvtxStartKernel() override = default;
+  // Every kernel instance gets its own NvtxOpKernelState (i.e. its own counter).
+  std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
+      user_op::KernelInitContext* ctx) const override {
+    return std::make_shared<NvtxOpKernelState>();
+  }
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
+               const user_op::OpKernelCache*) const override {
+    const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("in", 0);
+    user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("out", 0);
+    // The op forwards its input unchanged, so both tensors must agree on shape and data type.
+    const ShapeView& shape = input->shape_view();
+    CHECK_EQ(output->shape_view(), shape);
+    const DataType data_type = input->data_type();
+    CHECK_EQ(output->data_type(), data_type);
+    const auto num_bytes = shape.elem_cnt() * GetSizeOfDataType(data_type);
+    Memcpy<DeviceType::kCUDA>(ctx->stream(), output->mut_dptr<void>(), input->dptr<void>(),
+                              num_bytes);
+#ifdef OF_ENABLE_PROFILER
+    auto* nvtx_state = dynamic_cast<NvtxOpKernelState*>(state);
+    const std::string mark =
+        ctx->Attr<std::string>("mark_prefix") + "-" + std::to_string(nvtx_state->counter());
+    const roctx_range_id_t opened_range = roctxRangeStartA(mark.c_str());
+    // The same mark must not already be open on this thread.
+    const bool inserted = mark2range_id.emplace(mark, opened_range).second;
+    CHECK(inserted);
+    nvtx_state->IncreaseCount();
+#endif  // OF_ENABLE_PROFILER
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+// Registers NvtxStartKernel as the "nvtx_start" kernel for DeviceType::kCUDA and proposes the
+// in-place pair ("out", 0) <- ("in", 0).
+// NOTE(review): the trailing `false` is presumably the "is mutable" flag of the pair -- confirm
+// against user_op::AddInplaceArgPair.
+REGISTER_USER_KERNEL("nvtx_start")
+    .SetCreateFn<NvtxStartKernel>()
+    .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA)
+    .SetInplaceProposalFn([](const user_op::InferContext&,
+                             user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> {
+      OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));
+      return Maybe<void>::Ok();
+    });
+// Closes the roctx range opened under the same mark ("<mark_prefix>-<counter>") and copies
+// "in" to "out".
+class NvtxEndKernel final : public user_op::OpKernel {
+ public:
+  NvtxEndKernel() = default;
+  ~NvtxEndKernel() override = default;
+  // Every kernel instance gets its own NvtxOpKernelState (i.e. its own counter).
+  std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
+      user_op::KernelInitContext* ctx) const override {
+    return std::make_shared<NvtxOpKernelState>();
+  }
+ private:
+  using user_op::OpKernel::Compute;
+  // In OF_ENABLE_PROFILER builds, stops the range recorded in mark2range_id for the current
+  // mark and advances the counter. In every build, copies the input tensor to the output.
+  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
+               const user_op::OpKernelCache*) const override {
+    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
+    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
+    const ShapeView& in_shape = in->shape_view();
+    CHECK_EQ(out->shape_view(), in_shape);
+    const DataType in_data_type = in->data_type();
+    CHECK_EQ(out->data_type(), in_data_type);
+#ifdef OF_ENABLE_PROFILER
+    auto* kernel_state = dynamic_cast<NvtxOpKernelState*>(state);
+    // Fail loudly instead of dereferencing null when the state has an unexpected type.
+    CHECK(kernel_state != nullptr);
+    const std::string mark_prefix = ctx->Attr<std::string>("mark_prefix");
+    const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter());
+    // mark2range_id is thread_local, so the range must have been opened on this thread.
+    auto it = mark2range_id.find(mark);
+    CHECK(it != mark2range_id.end());
+    roctx_range_id_t range_id = it->second;
+    mark2range_id.erase(it);
+    roctxRangeStop(range_id);
+    kernel_state->IncreaseCount();
+#endif  // OF_ENABLE_PROFILER
+    // The copy must not depend on OF_ENABLE_PROFILER: the in-place reuse of "in" is only a
+    // proposal, so without this copy "out" would stay unwritten in non-profiler builds.
+    Memcpy<DeviceType::kCUDA>(ctx->stream(), out->mut_dptr<void>(), in->dptr<void>(),
+                              in_shape.elem_cnt() * GetSizeOfDataType(in_data_type));
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+// Registers NvtxEndKernel as the "nvtx_end" kernel for DeviceType::kCUDA and proposes the
+// in-place pair ("out", 0) <- ("in", 0).
+// NOTE(review): the trailing `false` is presumably the "is mutable" flag of the pair -- confirm
+// against user_op::AddInplaceArgPair.
+REGISTER_USER_KERNEL("nvtx_end")
+    .SetCreateFn<NvtxEndKernel>()
+    .SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA)
+    .SetInplaceProposalFn([](const user_op::InferContext&,
+                             user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> {
+      OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));
+      return Maybe<void>::Ok();
+    });
+}  // namespace oneflow
--- a/oneflow/user/kernels/stateful_opkernel.cpp
+++ b/oneflow/user/kernels/stateful_opkernel.cpp
--- a/python/oneflow/test/modules/fused_dot_feature_interaction.py
+++ b/python/oneflow/test/modules/fused_dot_feature_interaction.py
+import numpy as np
+import oneflow as flow
+def fused_dot_feature_interaction(x,
+                                  y,
+                                  self_interaction=False,
+                                  output_padding=0,
+                                  output_concat=None,
+                                  dtype=flow.float32
+                                  ):
+    """Unfused implementation of the dot feature interaction built from basic ops.
+
+    ``x`` is reshaped to ``(bs, 1, es)`` and concatenated with ``y`` (shape
+    ``(bs, dims, es)``) along dim 1 into ``T``. The pairwise products
+    ``Z = T @ T^T`` are computed and the entries ``Z[:, i, j]`` with ``j < i``
+    (or ``j <= i`` when ``self_interaction`` is true) are gathered into a 2-D tensor.
+
+    Args:
+        x: tensor reshapeable to ``(bs, 1, es)``.
+        y: tensor of shape ``(bs, dims, es)``.
+        self_interaction: include the diagonal entries (``i == j``) when true.
+        output_padding: number of zero columns appended to the result.
+        output_concat: optional tensor concatenated in front of the gathered
+            values along dim 1.
+        dtype: dtype the gathered values are cast to.
+
+    Returns:
+        The tensor ``[output_concat | gathered values | zero padding]``
+        concatenated along dim 1.
+    """
+    # (bs, es) = x.shape
+    (bs, dims, es) = y.shape
+    # With offset 1 the diagonal (i == j) is selected as well.
+    offset = 1 if self_interaction else 0
+    # Row/column indices of the selected lower-triangular entries of Z.
+    li = flow.tensor([i for i in range(dims + 1) for j in range(i + offset)])
+    lj = flow.tensor([j for i in range(dims + 1) for j in range(i + offset)])
+    T = flow.cat(
+        [
+            flow.reshape(x, (bs, 1, es)),
+            y,
+        ],
+        dim=1,
+    )
+    Z = flow.matmul(T, T, transpose_b=True)
+    # gather_nd not support half, so cast to float32
+    Z = flow.cast(Z, flow.float32)
+    Zflat = Z[:, li, lj]
+    Zflat = flow.cast(Zflat, dtype)
+    if output_concat is not None:
+        R = flow.cat([output_concat, Zflat], dim=1)
+    else:
+        R = Zflat
+    if output_padding != 0:
+        # Create the padding on R's device and in R's dtype so the concatenation
+        # also works for non-CUDA inputs and for dtype != float32.
+        padding_tensor = flow.tensor(
+            np.zeros((bs, output_padding)).astype(np.float32),
+            device=R.device,
+            requires_grad=False,
+        )
+        padding_tensor = flow.cast(padding_tensor, R.dtype)
+        R = flow.cat([R, padding_tensor], dim=1)
+    return R
--- a/python/oneflow/test/profiler/test_profile_lenet.py
+++ b/python/oneflow/test/profiler/test_profile_lenet.py