Commit f262efc9 authored by yuguo's avatar yuguo
Browse files

Support profiler for DCU, support debug compile

parent 3f56062c
......@@ -265,9 +265,9 @@ set(ROBIN_HOOD_HASHING_URL
use_mirror(VARIABLE ROBIN_HOOD_HASHING_URL URL ${ROBIN_HOOD_HASHING_URL})
set(ROBIN_HOOD_HASHING_MD5 a78bd30a7582f25984f8592652836467)
set(FMT_URL https://github.com/fmtlib/fmt/archive/48b7e3dafb27ece02cd6addc8bd1041c79d59c2c.zip)
set(FMT_URL https://github.com/fmtlib/fmt/archive/fc07217d85e6dcec52878807d6bbd89a9d9156a5.zip)
use_mirror(VARIABLE FMT_URL URL ${FMT_URL})
set(FMT_MD5 45925a979ed7195e0c88a70be691de09)
set(FMT_MD5 7d9bb2ececc9ede29cd35bdc42a7e22c)
set(KINETO_URL
https://github.com/pytorch/kineto/archive/ff8dba20499a660650632952be76450bd70a52a6.zip)
......
......@@ -175,6 +175,8 @@ if (BUILD_ROCM)
add_definitions(-D__HIP_PLATFORM_HCC__)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mcmodel=large")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -mcmodel=large")
list(APPEND oneflow_third_party_libs hip::device)
list(APPEND oneflow_third_party_libs roc::hipblas)
list(APPEND oneflow_third_party_libs hip::hipcub)
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// #include "fmt/core.h"
// #include "fmt/format.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/util.h"
using json = nlohmann::json;
namespace oneflow {
namespace profiler {
// Serialize the fields common to all events; subclasses extend this object.
// "input_shapes" defaults to "-" and is overwritten by KernelEvent::ToJson.
nlohmann::json IEvent::ToJson() {
return json{{"name", name_}, {"time", GetDuration<double>()}, {"input_shapes", "-"}};
}
// Raw timestamp setters; values are expressed in this event's time_unit_.
void IEvent::SetStartedAt(double t) { started_at_ = t; }
void IEvent::SetFinishedAt(double t) { finished_at_ = t; }
// Stamp the current wall-clock time as the event's start / finish.
void IEvent::Start() { SetStartedAt(GetTimeNow()); }
void IEvent::Finish() { SetFinishedAt(GetTimeNow()); }
// An event is a child of `e` when its [start, finish] interval is fully
// contained in e's interval. Never a child of null or of itself.
bool IEvent::IsChildOf(const IEvent* e) {
if (!e) { return false; }
if (this == e) { return false; }
return GetStartedAt<double>() >= e->GetStartedAt<double>()
&& GetFinishedAt<double>() <= e->GetFinishedAt<double>();
}
const std::string& IEvent::GetName() const { return name_; }
// A custom event is keyed by its name alone.
std::string CustomEvent::Key() { return name_; }
nlohmann::json CustomEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kCustom;
j["custom_type"] = type_;
return j;
}
// Factory: the constructor is private, so instances are always shared_ptr-owned.
std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, CustomEventType type) {
return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
}
// std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
std::string KernelEvent::Key() { return "yuguo"; }
// Extend the base JSON with kernel-specific fields.
nlohmann::json KernelEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kOneflowKernel;
j["input_shapes"] = GetFormatedInputShapes();
#if defined(WITH_CUDA)
// Device memory footprint and nested device-side events are tracked only in
// CUDA builds.
j["memory_size"] = memory_size_;
if (!children_.empty()) { j["children"] = children_; }
#endif // WITH_CUDA
return j;
}
// Factory: constructor is private. `shape_getter` may be null, in which case
// no input shapes are captured (see the private constructor).
std::shared_ptr<KernelEvent> KernelEvent::Create(
const std::string& name, const std::function<std::vector<ShapeView>(void)>& shape_getter) {
return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
}
// Append one input shape to the recorded list.
void KernelEvent::RecordShape(const ShapeView& shape) { input_shapes_.emplace_back(shape); }
// Format up to `max_num_to_format` recorded input shapes as "[s0, s1, ...]".
// Scalar shapes ("()") are rendered as "scalar"; an ellipsis marks truncation.
// Returns "-" when no shapes were recorded.
std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
if (input_shapes_.empty()) { return "-"; }
std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
// size_t index avoids the signed/unsigned comparison of the previous `auto i = 0`.
for (size_t i = 0; i < shapes_formated.size(); ++i) {
const std::string current_shape = input_shapes_[i].ToString();
shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
}
if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
// Join manually since fmt is disabled in this build; equivalent to
// fmt::format("[{}]", fmt::join(shapes_formated, ", ")).
std::string joined = "[";
for (size_t i = 0; i < shapes_formated.size(); ++i) {
if (i > 0) { joined += ", "; }
joined += shapes_formated[i];
}
joined += "]";
return joined;
}
} // namespace profiler
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "fmt/core.h"
#include "fmt/format.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/util.h"
using json = nlohmann::json;
namespace oneflow {
namespace profiler {
// Serialize the fields common to all events; subclasses extend this object.
// "input_shapes" defaults to "-" and is overwritten by KernelEvent::ToJson.
nlohmann::json IEvent::ToJson() {
return json{{"name", name_}, {"time", GetDuration<double>()}, {"input_shapes", "-"}};
}
// Raw timestamp setters; values are expressed in this event's time_unit_.
void IEvent::SetStartedAt(double t) { started_at_ = t; }
void IEvent::SetFinishedAt(double t) { finished_at_ = t; }
// Stamp the current wall-clock time as the event's start / finish.
void IEvent::Start() { SetStartedAt(GetTimeNow()); }
void IEvent::Finish() { SetFinishedAt(GetTimeNow()); }
// An event is a child of `e` when its [start, finish] interval is fully
// contained in e's interval. Never a child of null or of itself.
bool IEvent::IsChildOf(const IEvent* e) {
if (!e) { return false; }
if (this == e) { return false; }
return GetStartedAt<double>() >= e->GetStartedAt<double>()
&& GetFinishedAt<double>() <= e->GetFinishedAt<double>();
}
const std::string& IEvent::GetName() const { return name_; }
// A custom event is keyed by its name alone.
std::string CustomEvent::Key() { return name_; }
nlohmann::json CustomEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kCustom;
j["custom_type"] = type_;
return j;
}
// Factory: the constructor is private, so instances are always shared_ptr-owned.
std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, CustomEventType type) {
return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
}
// A kernel event is keyed by its name plus the formatted input shapes, so the
// same op invoked with different shapes is aggregated separately.
std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
// Extend the base JSON with kernel-specific fields.
nlohmann::json KernelEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kOneflowKernel;
j["input_shapes"] = GetFormatedInputShapes();
#if defined(WITH_CUDA) || defined(WITH_ROCM)
// Device memory footprint and nested device events exist only in GPU builds.
j["memory_size"] = memory_size_;
if (!children_.empty()) { j["children"] = children_; }
#endif // WITH_CUDA || WITH_ROCM
return j;
}
// Factory: constructor is private. `shape_getter` may be null, in which case
// no input shapes are captured (see the private constructor).
std::shared_ptr<KernelEvent> KernelEvent::Create(
const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter) {
return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
}
// Format up to `max_num_to_format` recorded input shapes as "[s0, s1, ...]".
// Scalar shapes ("()") are rendered as "scalar"; an ellipsis marks truncation.
// Returns "-" when no shapes were recorded.
std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
if (input_shapes_.empty()) { return "-"; }
std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
// size_t index avoids the signed/unsigned comparison of the previous `auto i = 0`.
for (size_t i = 0; i < shapes_formated.size(); ++i) {
const std::string current_shape = input_shapes_[i].ToString();
shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
}
if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
}
} // namespace profiler
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_
#define ONEFLOW_CORE_PROFILER_EVENT_H_
#include <functional>
#include <memory>
#include <vector>
#include "nlohmann/json.hpp"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/shape_view.h"
namespace oneflow {
namespace profiler {
class ProfileManager;
// Top-level discriminator stored in the serialized JSON.
enum class EventType {
kCustom, // has three kinds
kOneflowKernel // OneFlow cpu/cuda kernel
};
// Sub-kind for kCustom events.
enum class CustomEventType {
kDefault, // for record_function
kCudaKernel, // cuda kernel
kCudaRuntime // something like cudaLaunchKernel
};
// Unit in which an event's raw timestamps are stored.
enum class EventTimeUnit { kNS, kUS };
// Abstract base for all profiler events: owns the name, the native time unit,
// and raw start/finish timestamps. Getters convert between units on demand.
class IEvent {
public:
OF_DISALLOW_COPY_AND_MOVE(IEvent);
IEvent() = delete;
IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {}
// Unique-ish key used for aggregation; defined by each subclass.
virtual std::string Key() = 0;
virtual nlohmann::json ToJson();
virtual ~IEvent() = default;
// Record the current time as start / finish.
virtual void Start();
virtual void Finish();
// True when this event's time interval is strictly contained in e's.
bool IsChildOf(const IEvent* e);
const std::string& GetName() const;
// Specialized for double and time_t; `time_unit` selects the output unit.
template<typename T>
const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
template<typename T>
const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
template<typename T>
const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
protected:
virtual void SetStartedAt(double t);
virtual void SetFinishedAt(double t);
std::string name_;
EventTimeUnit time_unit_; // unit of started_at_ / finished_at_
double started_at_ = 0;
double finished_at_ = 0;
};
// Convert `time_` between nanoseconds and microseconds.
// Any other src/dst combination (including identical units) is returned
// unchanged.
inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) {
  const bool ns_to_us = src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS;
  const bool us_to_ns = src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS;
  if (ns_to_us) { return time_ / 1000; }
  if (us_to_ns) { return time_ * 1000; }
  return time_;
}
// Unit-converting accessors: the double specializations do the conversion,
// the time_t ones truncate the double result.
template<>
const inline double IEvent::GetStartedAt<double>(EventTimeUnit time_unit) const {
return ConvertTime(started_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetStartedAt<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetStartedAt<double>(time_unit));
}
template<>
const inline double IEvent::GetFinishedAt<double>(EventTimeUnit time_unit) const {
return ConvertTime(finished_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetFinishedAt<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetFinishedAt<double>(time_unit));
}
template<>
const inline double IEvent::GetDuration<double>(EventTimeUnit time_unit) const {
return GetFinishedAt<double>(time_unit) - GetStartedAt<double>(time_unit);
}
template<>
const inline time_t IEvent::GetDuration<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetDuration<double>(time_unit));
}
// User-defined event (e.g. record_function ranges). kDefault events store
// nanoseconds; CUDA-sourced events store microseconds.
class CustomEvent final : public IEvent {
public:
friend class ProfileManager;
std::string Key() override;
nlohmann::json ToJson() override;
// Only way to construct; constructor is private.
static std::shared_ptr<CustomEvent> Create(const std::string& name,
CustomEventType type = CustomEventType::kDefault);
private:
CustomEventType type_;
CustomEvent(const std::string& custom_name, CustomEventType type)
: IEvent(custom_name,
type == CustomEventType::kDefault ? EventTimeUnit::kNS : EventTimeUnit::kUS),
type_(type) {}
};
// Event for one OneFlow kernel invocation. Optionally captures input shapes;
// in CUDA builds it also tracks a memory footprint and nested device events.
class KernelEvent final : public IEvent {
public:
std::string Key() override;
nlohmann::json ToJson() override;
static std::shared_ptr<KernelEvent> Create(
const std::string& name, const std::function<std::vector<ShapeView>(void)>& shape_getter);
void RecordShape(const ShapeView& shape);
#if defined(WITH_CUDA)
void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
// Adopt `e` as a child only when e's interval lies inside this event's.
bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
if (e->IsChildOf(dynamic_cast<IEvent*>(this))) {
children_.emplace(e);
return true;
}
return false;
}
bool HasChildEvent(const std::shared_ptr<IEvent>& e) { return children_.count(e); }
// Apply `f` to every recorded child event.
void WalkAmongChildren(const std::function<void(const std::shared_ptr<IEvent>& e)>& f) const {
for (const auto& x : children_) { f(x); }
}
#endif // WITH_CUDA
private:
KernelEvent(const std::string& kernel_name,
const std::function<std::vector<ShapeView>(void)>& shape_getter)
: IEvent(kernel_name, EventTimeUnit::kNS) {
if (shape_getter) { input_shapes_ = shape_getter(); }
}
#if defined(WITH_CUDA)
int64_t memory_size_ = -1; // -1 means "not measured"
std::set<std::shared_ptr<IEvent>> children_;
#endif // WITH_CUDA
std::vector<ShapeView> input_shapes_;
std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
};
} // namespace profiler
} // namespace oneflow
namespace nlohmann {
// ADL hook so containers of IEvent shared_ptrs serialize via IEvent::ToJson.
inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) {
j = event->ToJson();
}
} // namespace nlohmann
#endif // ONEFLOW_CORE_PROFILER_EVENT_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_
#define ONEFLOW_CORE_PROFILER_EVENT_H_
#include <functional>
#include <memory>
#include <vector>
#include "nlohmann/json.hpp"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/shape_view.h"
namespace oneflow {
namespace profiler {
class ProfileManager;
// Top-level discriminator stored in the serialized JSON.
enum class EventType {
kCustom, // has three kinds
kOneflowKernel // OneFlow cpu/cuda kernel
};
// Sub-kind for kCustom events.
enum class CustomEventType {
kDefault, // for record_function
kCudaKernel, // cuda kernel
kCudaRuntime // something like cudaLaunchKernel
};
// Unit in which an event's raw timestamps are stored.
enum class EventTimeUnit { kNS, kUS };
// Abstract base for all profiler events: owns the name, the native time unit,
// and raw start/finish timestamps. Getters convert between units on demand.
class IEvent {
public:
OF_DISALLOW_COPY_AND_MOVE(IEvent);
IEvent() = delete;
IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {}
// Unique-ish key used for aggregation; defined by each subclass.
virtual std::string Key() = 0;
virtual nlohmann::json ToJson();
virtual ~IEvent() = default;
// Record the current time as start / finish.
virtual void Start();
virtual void Finish();
// True when this event's time interval is strictly contained in e's.
bool IsChildOf(const IEvent* e);
const std::string& GetName() const;
// Specialized for double and time_t; `time_unit` selects the output unit.
template<typename T>
const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
template<typename T>
const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
template<typename T>
const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
protected:
virtual void SetStartedAt(double t);
virtual void SetFinishedAt(double t);
std::string name_;
EventTimeUnit time_unit_; // unit of started_at_ / finished_at_
double started_at_ = 0;
double finished_at_ = 0;
};
// Convert `time_` between nanoseconds and microseconds; any other combination
// (including identical units) is returned unchanged.
inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) {
if (src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS) {
return time_ / 1000;
}
if (src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS) {
return time_ * 1000;
}
return time_;
}
// Unit-converting accessors: the double specializations do the conversion,
// the time_t ones truncate the double result.
template<>
const inline double IEvent::GetStartedAt<double>(EventTimeUnit time_unit) const {
return ConvertTime(started_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetStartedAt<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetStartedAt<double>(time_unit));
}
template<>
const inline double IEvent::GetFinishedAt<double>(EventTimeUnit time_unit) const {
return ConvertTime(finished_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetFinishedAt<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetFinishedAt<double>(time_unit));
}
template<>
const inline double IEvent::GetDuration<double>(EventTimeUnit time_unit) const {
return GetFinishedAt<double>(time_unit) - GetStartedAt<double>(time_unit);
}
template<>
const inline time_t IEvent::GetDuration<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetDuration<double>(time_unit));
}
// User-defined event (e.g. record_function ranges). kDefault events store
// nanoseconds; device-sourced events store microseconds.
class CustomEvent final : public IEvent {
public:
friend class ProfileManager;
std::string Key() override;
nlohmann::json ToJson() override;
// Only way to construct; constructor is private.
static std::shared_ptr<CustomEvent> Create(const std::string& name,
CustomEventType type = CustomEventType::kDefault);
private:
CustomEventType type_;
CustomEvent(const std::string& custom_name, CustomEventType type)
: IEvent(custom_name,
type == CustomEventType::kDefault ? EventTimeUnit::kNS : EventTimeUnit::kUS),
type_(type) {}
};
// Event for one OneFlow kernel invocation. Optionally captures input shapes
// (as owned Shape values here, unlike the ShapeView-based variant); in
// CUDA/ROCm builds it also tracks a memory footprint and nested device events.
class KernelEvent final : public IEvent {
public:
std::string Key() override;
nlohmann::json ToJson() override;
static std::shared_ptr<KernelEvent> Create(
const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter);
#if defined(WITH_CUDA) || defined(WITH_ROCM)
void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
// Adopt `e` as a child only when e's interval lies inside this event's.
bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
if (e->IsChildOf(dynamic_cast<IEvent*>(this))) {
children_.emplace(e);
return true;
}
return false;
}
bool HasChildEvent(const std::shared_ptr<IEvent>& e) { return children_.count(e); }
// Apply `f` to every recorded child event.
void WalkAmongChildren(const std::function<void(const std::shared_ptr<IEvent>& e)>& f) const {
for (const auto& x : children_) { f(x); }
}
#endif // WITH_CUDA || WITH_ROCM
private:
KernelEvent(const std::string& kernel_name,
const std::function<std::vector<Shape>(void)>& shape_getter)
: IEvent(kernel_name, EventTimeUnit::kNS) {
if (shape_getter) { input_shapes_ = shape_getter(); }
}
#if defined(WITH_CUDA) || defined(WITH_ROCM)
int64_t memory_size_ = -1; // -1 means "not measured"
std::set<std::shared_ptr<IEvent>> children_;
#endif // WITH_CUDA || WITH_ROCM
std::vector<Shape> input_shapes_;
std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
};
} // namespace profiler
} // namespace oneflow
namespace nlohmann {
// ADL hook so containers of IEvent shared_ptrs serialize via IEvent::ToJson.
inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) {
j = event->ToJson();
}
} // namespace nlohmann
#endif // ONEFLOW_CORE_PROFILER_EVENT_H_
......@@ -32,13 +32,13 @@ std::shared_ptr<EventRecorder> EventRecorder::CreateCustomEventRecorder(const st
Maybe<EventRecorder> EventRecorder::CreateKernelEventRecorder(
const std::string& name,
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
const std::function<int64_t()>& memory_size_getter,
#endif
const ShapeGetterFuncType& shape_getter) {
auto pmgr = Singleton<ProfileManager>::Get();
if (pmgr) {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
if (pmgr->use_cpu_ || pmgr->use_cuda_) {
auto event = KernelEvent::Create(name, pmgr->record_shapes_ ? shape_getter : nullptr);
if (pmgr->use_cuda_) {
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#include "oneflow/core/common/util.h"
#include "oneflow/core/profiler/event.h"
namespace oneflow {
namespace profiler {
// RAII recorder: registers the event with the ProfileManager and starts it on
// construction, finishes it on destruction.
class EventRecorder {
public:
using ShapeGetterFuncType = std::function<std::vector<ShapeView>(void)>;
OF_DISALLOW_COPY_AND_MOVE(EventRecorder);
explicit EventRecorder(const std::shared_ptr<IEvent>& event) : event_(event) {
CHECK_JUST(RegisterEventToProfileManager(event));
event_->Start();
}
Maybe<void> RegisterEventToProfileManager(const std::shared_ptr<IEvent>& event);
~EventRecorder() {
// Stamp the finish time exactly once, then drop our ownership share.
if (event_) {
event_->Finish();
event_.reset();
}
}
static std::shared_ptr<EventRecorder> CreateCustomEventRecorder(const std::string& name);
// The memory-size getter exists only in CUDA builds.
static Maybe<EventRecorder> CreateKernelEventRecorder(
const std::string& name,
#if defined(WITH_CUDA)
const std::function<int64_t()>& memory_size_getter,
#endif
const ShapeGetterFuncType& shape_getter);
private:
std::shared_ptr<IEvent> event_;
};
} // namespace profiler
} // namespace oneflow
#endif // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#include "oneflow/core/common/util.h"
#include "oneflow/core/profiler/event.h"
namespace oneflow {
namespace profiler {
// RAII recorder: registers the event with the ProfileManager and starts it on
// construction, finishes it on destruction.
class EventRecorder {
public:
using ShapeGetterFuncType = std::function<std::vector<Shape>(void)>;
OF_DISALLOW_COPY_AND_MOVE(EventRecorder);
explicit EventRecorder(const std::shared_ptr<IEvent>& event) : event_(event) {
CHECK_JUST(RegisterEventToProfileManager(event));
event_->Start();
}
Maybe<void> RegisterEventToProfileManager(const std::shared_ptr<IEvent>& event);
~EventRecorder() {
// Stamp the finish time exactly once, then drop our ownership share.
if (event_) {
event_->Finish();
event_.reset();
}
}
static std::shared_ptr<EventRecorder> CreateCustomEventRecorder(const std::string& name);
// The memory-size getter exists only in CUDA/ROCm builds.
static Maybe<EventRecorder> CreateKernelEventRecorder(
const std::string& name,
#if defined(WITH_CUDA) || defined(WITH_ROCM)
const std::function<int64_t()>& memory_size_getter,
#endif
const ShapeGetterFuncType& shape_getter);
private:
std::shared_ptr<IEvent> event_;
};
} // namespace profiler
} // namespace oneflow
#endif // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
......@@ -17,7 +17,11 @@ limitations under the License.
#include "oneflow/core/profiler/kernel.h"
#include "oneflow/core/profiler/profiler.h"
#include "oneflow/core/kernel/kernel.h"
#ifdef WITH_ROCM
#include "oneflow/core/ep/rocm/cuda_stream.h"
#else
#include "oneflow/core/ep/cuda/cuda_stream.h"
#endif
#include "oneflow/core/lazy/actor/actor_context.h"
namespace oneflow {
......@@ -43,6 +47,11 @@ thread_local cudaEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
thread_local cudaEvent_t cuda_memory_bandwidth_profile_end_event = nullptr;
#endif // WITH_CUDA
#if defined(WITH_ROCM)
thread_local hipEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
thread_local hipEvent_t cuda_memory_bandwidth_profile_end_event = nullptr;
#endif // WITH_ROCM
} // namespace
void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel* kernel) {
......@@ -61,6 +70,22 @@ void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel*
}
if (profile_kernel_forward_range) { OF_PROFILER_RANGE_PUSH(kernel->op_conf().name()); }
#endif // WITH_CUDA
#if defined(WITH_ROCM)
if (profile_cuda_memory_bandwidth) {
auto* actor_context_provider = dynamic_cast<ActorContextProvider*>(kernel_ctx);
auto* cuda_stream = dynamic_cast<ep::CudaStream*>(kernel_ctx->stream());
if (cuda_stream != nullptr && actor_context_provider != nullptr) {
CHECK(cuda_memory_bandwidth_profile_start_event == nullptr);
CHECK(cuda_memory_bandwidth_profile_end_event == nullptr);
OF_CUDA_CHECK(hipEventCreate(&cuda_memory_bandwidth_profile_start_event));
OF_CUDA_CHECK(hipEventCreate(&cuda_memory_bandwidth_profile_end_event));
OF_CUDA_CHECK(
hipEventRecord(cuda_memory_bandwidth_profile_start_event, cuda_stream->cuda_stream()));
}
}
if (profile_kernel_forward_range) { OF_PROFILER_RANGE_PUSH(kernel->op_conf().name()); }
#endif // WITH_ROCM
}
void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* kernel) {
......@@ -103,6 +128,45 @@ void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* k
}
}
#endif // WITH_CUDA
#if defined(WITH_ROCM)
if (profile_kernel_forward_range) { OF_PROFILER_RANGE_POP(); }
// The memory bandwidth profiler only works in lazy mode.
if (profile_cuda_memory_bandwidth) {
auto* cuda_stream = dynamic_cast<ep::CudaStream*>(kernel_ctx->stream());
auto* actor_context_provider = dynamic_cast<ActorContextProvider*>(kernel_ctx);
if (cuda_stream != nullptr && actor_context_provider != nullptr) {
hipEvent_t start_event = cuda_memory_bandwidth_profile_start_event;
hipEvent_t end_event = cuda_memory_bandwidth_profile_end_event;
cuda_memory_bandwidth_profile_start_event = nullptr;
cuda_memory_bandwidth_profile_end_event = nullptr;
CHECK_NOTNULL(start_event);
CHECK_NOTNULL(end_event);
OF_CUDA_CHECK(hipEventRecord(end_event, cuda_stream->cuda_stream()));
int64_t memory_size = 0;
for (const auto& bn : kernel->op_attribute().input_bns()) {
const Blob* blob = kernel_ctx->BnInOp2Blob(bn);
if (blob) { memory_size += blob->ByteSizeOfBlobBody(); }
}
for (const auto& bn : kernel->op_attribute().output_bns()) {
const Blob* blob = kernel_ctx->BnInOp2Blob(bn);
if (blob) { memory_size += blob->ByteSizeOfBlobBody(); }
}
const std::string op_name = kernel->op_conf().name();
actor_context_provider->GetActorContext()->AddCallback(
[start_event, end_event, memory_size, op_name]() {
float elapsed_ms = 0;
OF_CUDA_CHECK(hipEventElapsedTime(&elapsed_ms, start_event, end_event));
OF_CUDA_CHECK(hipEventDestroy(start_event));
OF_CUDA_CHECK(hipEventDestroy(end_event));
double bandwidth =
static_cast<double>(memory_size) / (1024.0 * 1024.0 * 1024.0) / (elapsed_ms / 1000);
LOG(INFO) << "PROFILER::KERNEL::CUDA_MEMORY_BANDWIDTH op_name: " << op_name
<< " elapsed(ms): " << elapsed_ms << " memory_size(Byte): " << memory_size
<< " bandwidth(GB/s): " << bandwidth;
});
}
}
#endif // WITH_ROCM
}
} // namespace profiler
......
......@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#include "oneflow/core/profiler/kineto_shim.h"
#include "libkineto.h"
......
......@@ -16,7 +16,7 @@ limitations under the License.
#ifndef ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
#define ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#include <string>
#include <memory>
......
......@@ -15,12 +15,12 @@ limitations under the License.
*/
#include <memory>
#include <unordered_map>
// #include "fmt/core.h"
#include "fmt/core.h"
#include "nlohmann/json.hpp"
#include "oneflow/core/profiler/kineto_shim.h"
#include "oneflow/core/profiler/profile_manager.h"
#include "oneflow/core/profiler/event.h"
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#include <libkineto.h>
#endif // WITH_CUDA
......@@ -48,7 +48,7 @@ std::string ProfileManager::DumpResultsJson() {
}
std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
auto trace = StopTrace();
const auto& kineto_events = *(trace.get()->activities());
std::set<std::shared_ptr<IEvent>> custom_events;
......@@ -77,7 +77,7 @@ std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
while (!events_.empty()) {
auto evt = events_.front();
events_.pop();
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
auto evt_kernel = std::dynamic_pointer_cast<KernelEvent>(evt);
if (evt_kernel) {
std::set<int64_t> current_corr_ids;
......@@ -106,8 +106,7 @@ std::string ProfileManager::GetNextEventRecorderKey(const std::string& name) {
} else {
event_recorders_last_id_[name]++;
}
// return fmt::format("{}.{}", name, event_recorders_last_id_[name]);
return "yuguo";
return fmt::format("{}.{}", name, event_recorders_last_id_[name]);
}
} // namespace profiler
......
......@@ -37,7 +37,7 @@ class ProfileManager {
use_cuda_(use_cuda),
record_shapes_(record_shapes),
record_bandwidth_(record_bandwidth) {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
std::set<ActivityType> activities{};
if (use_cpu) { activities.insert(ActivityType::CPU); }
if (use_cuda) { activities.insert(ActivityType::CUDA); }
......
......@@ -20,11 +20,20 @@ limitations under the License.
#include "oneflow/core/profiler/event_recorder.h"
#include "oneflow/core/vm/vm_util.h"
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_profile.h>
#include <roctracer_roctx.h>
#include <sys/syscall.h>
#include <iostream>
#include "oneflow/core/device/cuda_util.h"
#else
#include <nvtx3/nvToolsExt.h>
#include <sys/syscall.h>
#include <iostream>
#include <cuda_profiler_api.h>
#include "oneflow/core/device/cuda_util.h"
#endif
#endif // OF_ENABLE_PROFILER
namespace oneflow {
......@@ -33,6 +42,16 @@ namespace profiler {
void NameThisHostThread(const std::string& name) {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
static thread_local std::unique_ptr<std::string> thread_name_prefix;
if (!thread_name_prefix) {
thread_name_prefix.reset(
new std::string(GetStringFromEnv("ONEFLOW_PROFILER_HOST_THREAD_NAME_PREFIX", "")));
}
const std::string name_with_prefix = *thread_name_prefix + name;
// nvtxNameOsThreadA(syscall(SYS_gettid), name_with_prefix.c_str());
roctxMarkA(name_with_prefix.c_str());
#else
static thread_local std::unique_ptr<std::string> thread_name_prefix;
if (!thread_name_prefix) {
thread_name_prefix.reset(
......@@ -40,18 +59,27 @@ void NameThisHostThread(const std::string& name) {
}
const std::string name_with_prefix = *thread_name_prefix + name;
nvtxNameOsThreadA(syscall(SYS_gettid), name_with_prefix.c_str());
#endif
#endif // OF_ENABLE_PROFILER
}
void RangePush(const std::string& name) {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
roctxRangePushA(name.c_str());
#else
nvtxRangePushA(name.c_str());
#endif
#endif // OF_ENABLE_PROFILER
}
void RangePop() {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
roctxRangePop();
#else
nvtxRangePop();
#endif
#endif // OF_ENABLE_PROFILER
}
......@@ -82,13 +110,21 @@ void LogHostMemoryUsage(const std::string& name) {
void ProfilerStart() {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
OF_CUDA_CHECK(hipProfilerStart());
#else
OF_CUDA_CHECK(cudaProfilerStart());
#endif
#endif // OF_ENABLE_PROFILER
}
void ProfilerStop() {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
OF_CUDA_CHECK(hipProfilerStop());
#else
OF_CUDA_CHECK(cudaProfilerStop());
#endif
#endif // OF_ENABLE_PROFILER
}
......@@ -105,6 +141,9 @@ Maybe<std::string> DisableProfilerAndReturnResult() {
#if defined(WITH_CUDA)
OF_CUDA_CHECK(cudaDeviceSynchronize());
#endif // WITH_CUDA
#if defined(WITH_ROCM)
OF_CUDA_CHECK(hipDeviceSynchronize());
#endif // WITH_ROCM
auto* pmgr = JUST(SingletonMaybe<ProfileManager>());
std::string results = pmgr->DumpResultsJson();
Singleton<ProfileManager>::Delete();
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#ifdef OF_ENABLE_PROFILER
#include <roctracer_roctx.h>
#endif // OF_ENABLE_PROFILER
namespace oneflow {
namespace {
#ifdef OF_ENABLE_PROFILER
static thread_local HashMap<std::string, roctx_range_id_t> mark2range_id;
#endif
} // namespace
// Per-op state counting how many times the kernel has run; the count makes
// each emitted range name unique ("<prefix>-0", "<prefix>-1", ...).
class NvtxOpKernelState final : public user_op::OpKernelState {
 public:
  NvtxOpKernelState() {
#ifndef OF_ENABLE_PROFILER
    LOG(WARNING) << "To use NVTX, run cmake with -DBUILD_PROFILER=ON";
#endif
  }
  ~NvtxOpKernelState() override = default;

  // Number of completed invocations so far.
  int64_t counter() const { return counter_; }
  // Record one more invocation.
  void IncreaseCount() { counter_ += 1; }

 private:
  int64_t counter_ = 0;
};
// Forwards its input to its output and, when the profiler is compiled in,
// opens a roctx range named "<mark_prefix>-<counter>" whose id is stashed in
// the thread-local mark2range_id map for the matching nvtx_end kernel.
class NvtxStartKernel final : public user_op::OpKernel {
 public:
  NvtxStartKernel() = default;
  ~NvtxStartKernel() override = default;

  std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
      user_op::KernelInitContext* ctx) const override {
    return std::make_shared<NvtxOpKernelState>();
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache*) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    const ShapeView& shape = in->shape_view();
    CHECK_EQ(out->shape_view(), shape);
    const DataType dtype = in->data_type();
    CHECK_EQ(out->data_type(), dtype);
    // Pass the tensor through unconditionally; the range bookkeeping below is
    // compiled only in profiler builds.
    Memcpy<DeviceType::kCUDA>(ctx->stream(), out->mut_dptr<void>(), in->dptr<void>(),
                              shape.elem_cnt() * GetSizeOfDataType(dtype));
#ifdef OF_ENABLE_PROFILER
    auto* counter_state = dynamic_cast<NvtxOpKernelState*>(state);
    const std::string range_name =
        ctx->Attr<std::string>("mark_prefix") + "-" + std::to_string(counter_state->counter());
    const roctx_range_id_t range_id = roctxRangeStartA(range_name.c_str());
    // A duplicate mark would orphan the earlier range id; fail loudly instead.
    CHECK(mark2range_id.emplace(range_name, range_id).second);
    counter_state->IncreaseCount();
#endif  // OF_ENABLE_PROFILER
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers nvtx_start for CUDA-type devices and proposes (but does not
// require, hence the trailing `false`) in-place reuse of the input buffer
// as the output, since the kernel merely forwards in -> out.
REGISTER_USER_KERNEL("nvtx_start")
.SetCreateFn<NvtxStartKernel>()
.SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA)
.SetInplaceProposalFn([](const user_op::InferContext&,
user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> {
OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));
return Maybe<void>::Ok();
});
// Stops the roctx range opened by the matching nvtx_start invocation
// (looked up by its "<mark_prefix>-<counter>" name in the thread-local
// mark2range_id map) and forwards its input to its output.
class NvtxEndKernel final : public user_op::OpKernel {
 public:
  NvtxEndKernel() = default;
  ~NvtxEndKernel() override = default;

  std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
      user_op::KernelInitContext* ctx) const override {
    return std::make_shared<NvtxOpKernelState>();
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache*) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    const ShapeView& in_shape = in->shape_view();
    CHECK_EQ(out->shape_view(), in_shape);
    const DataType in_data_type = in->data_type();
    CHECK_EQ(out->data_type(), in_data_type);
#ifdef OF_ENABLE_PROFILER
    auto* kernel_state = dynamic_cast<NvtxOpKernelState*>(state);
    const std::string mark_prefix = ctx->Attr<std::string>("mark_prefix");
    const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter());
    // Look up with the std::string key directly; find(mark.c_str()) built a
    // needless temporary std::string for every call.
    auto it = mark2range_id.find(mark);
    CHECK(it != mark2range_id.end());
    roctx_range_id_t range_id = it->second;
    mark2range_id.erase(it);
    roctxRangeStop(range_id);
    kernel_state->IncreaseCount();
#endif  // OF_ENABLE_PROFILER
    // BUG FIX: this copy used to live inside the #ifdef above, so a build
    // without OF_ENABLE_PROFILER never wrote `out` (the in-place proposal is
    // optional, so `out` could be garbage). Match NvtxStartKernel and always
    // produce the output; the enabled path still stops the range before the
    // copy is issued, as before.
    Memcpy<DeviceType::kCUDA>(ctx->stream(), out->mut_dptr<void>(), in->dptr<void>(),
                              in_shape.elem_cnt() * GetSizeOfDataType(in_data_type));
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers nvtx_end for CUDA-type devices; like nvtx_start it proposes
// optional (non-required) in-place reuse of the input buffer as the output.
REGISTER_USER_KERNEL("nvtx_end")
.SetCreateFn<NvtxEndKernel>()
.SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA)
.SetInplaceProposalFn([](const user_op::InferContext&,
user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> {
OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));
return Maybe<void>::Ok();
});
} // namespace oneflow
This diff is collapsed.
import numpy as np
import oneflow as flow
def fused_dot_feature_interaction(x,
                                  y,
                                  self_interaction=False,
                                  output_padding=0,
                                  output_concat=None,
                                  dtype=flow.float32
                                  ):
    """Reference implementation of the fused dot feature interaction op.

    Args:
        x: dense feature tensor of shape (bs, es).
        y: embedding tensor of shape (bs, dims, es).
        self_interaction: include the diagonal (i == j) pairs when True.
        output_padding: number of zero columns appended to the result.
        output_concat: optional tensor concatenated before the interaction
            features along dim 1.
        dtype: dtype of the interaction part of the result.

    Returns:
        Tensor of shape (bs, n_pairs [+ output_concat cols] [+ output_padding]).
    """
    (bs, dims, es) = y.shape
    offset = 1 if self_interaction else 0
    # Lower-triangular (i, j) index pairs of the (dims+1) x (dims+1) Gram matrix.
    li = flow.tensor([i for i in range(dims + 1) for j in range(i + offset)])
    lj = flow.tensor([j for i in range(dims + 1) for j in range(i + offset)])
    # Stack the dense row on top of the embeddings: T is (bs, dims+1, es).
    T = flow.cat(
        [
            flow.reshape(x, (bs, 1, es)),
            y,
        ],
        dim=1,
    )
    Z = flow.matmul(T, T, transpose_b=True)
    # gather_nd does not support half, so do the fancy indexing in float32.
    Z = flow.cast(Z, flow.float32)
    Zflat = flow.cast(Z[:, li, lj], dtype)
    if output_concat is not None:
        R = flow.cat([output_concat, Zflat], dim=1)
    else:
        R = Zflat
    if output_padding != 0:
        # BUG FIX: the padding used to be a hard-coded float32 tensor on
        # device="cuda", which broke dtype=float16 (cat dtype mismatch) and
        # any non-CUDA placement. Create it with R's own dtype and device.
        padding_tensor = flow.zeros(bs, output_padding, dtype=R.dtype, device=R.device)
        R = flow.cat([R, padding_tensor], dim=1)
    return R
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment