Commit f262efc9 authored by yuguo's avatar yuguo
Browse files

Support profiler for DCU, support debug compile

parent 3f56062c
......@@ -265,9 +265,9 @@ set(ROBIN_HOOD_HASHING_URL
use_mirror(VARIABLE ROBIN_HOOD_HASHING_URL URL ${ROBIN_HOOD_HASHING_URL})
set(ROBIN_HOOD_HASHING_MD5 a78bd30a7582f25984f8592652836467)
set(FMT_URL https://github.com/fmtlib/fmt/archive/48b7e3dafb27ece02cd6addc8bd1041c79d59c2c.zip)
set(FMT_URL https://github.com/fmtlib/fmt/archive/fc07217d85e6dcec52878807d6bbd89a9d9156a5.zip)
use_mirror(VARIABLE FMT_URL URL ${FMT_URL})
set(FMT_MD5 45925a979ed7195e0c88a70be691de09)
set(FMT_MD5 7d9bb2ececc9ede29cd35bdc42a7e22c)
set(KINETO_URL
https://github.com/pytorch/kineto/archive/ff8dba20499a660650632952be76450bd70a52a6.zip)
......
......@@ -175,6 +175,8 @@ if (BUILD_ROCM)
add_definitions(-D__HIP_PLATFORM_HCC__)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mcmodel=large")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -mcmodel=large")
list(APPEND oneflow_third_party_libs hip::device)
list(APPEND oneflow_third_party_libs roc::hipblas)
list(APPEND oneflow_third_party_libs hip::hipcub)
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// #include "fmt/core.h"
// #include "fmt/format.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/util.h"
using json = nlohmann::json;
namespace oneflow {
namespace profiler {
// Serialize the fields common to all events; subclasses extend this object.
// "input_shapes" defaults to "-" and is overwritten by KernelEvent::ToJson.
nlohmann::json IEvent::ToJson() {
return json{{"name", name_}, {"time", GetDuration<double>()}, {"input_shapes", "-"}};
}
// Raw timestamp setters; values are expressed in this event's time_unit_.
void IEvent::SetStartedAt(double t) { started_at_ = t; }
void IEvent::SetFinishedAt(double t) { finished_at_ = t; }
// Stamp the current wall-clock time as the event's start / finish.
void IEvent::Start() { SetStartedAt(GetTimeNow()); }
void IEvent::Finish() { SetFinishedAt(GetTimeNow()); }
// An event is a child of `e` when its [start, finish] interval is fully
// contained in e's interval. Never a child of null or of itself.
bool IEvent::IsChildOf(const IEvent* e) {
if (!e) { return false; }
if (this == e) { return false; }
return GetStartedAt<double>() >= e->GetStartedAt<double>()
&& GetFinishedAt<double>() <= e->GetFinishedAt<double>();
}
const std::string& IEvent::GetName() const { return name_; }
// A custom event is keyed by its name alone.
std::string CustomEvent::Key() { return name_; }
nlohmann::json CustomEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kCustom;
j["custom_type"] = type_;
return j;
}
// Factory: the constructor is private, so instances are always shared_ptr-owned.
std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, CustomEventType type) {
return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
}
// std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
std::string KernelEvent::Key() { return "yuguo"; }
// Extend the base JSON with kernel-specific fields.
nlohmann::json KernelEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kOneflowKernel;
j["input_shapes"] = GetFormatedInputShapes();
#if defined(WITH_CUDA)
// Device memory footprint and nested device-side events are tracked only in
// CUDA builds.
j["memory_size"] = memory_size_;
if (!children_.empty()) { j["children"] = children_; }
#endif // WITH_CUDA
return j;
}
// Factory: constructor is private. `shape_getter` may be null, in which case
// no input shapes are captured (see the private constructor).
std::shared_ptr<KernelEvent> KernelEvent::Create(
const std::string& name, const std::function<std::vector<ShapeView>(void)>& shape_getter) {
return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
}
// Append one input shape to the recorded list.
void KernelEvent::RecordShape(const ShapeView& shape) { input_shapes_.emplace_back(shape); }
// Format up to `max_num_to_format` recorded input shapes as "[s0, s1, ...]".
// Scalar shapes ("()") are rendered as "scalar"; an ellipsis marks truncation.
// Returns "-" when no shapes were recorded.
std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
if (input_shapes_.empty()) { return "-"; }
std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
// size_t index avoids the signed/unsigned comparison of the previous `auto i = 0`.
for (size_t i = 0; i < shapes_formated.size(); ++i) {
const std::string current_shape = input_shapes_[i].ToString();
shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
}
if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
// Join manually since fmt is disabled in this build; equivalent to
// fmt::format("[{}]", fmt::join(shapes_formated, ", ")).
std::string joined = "[";
for (size_t i = 0; i < shapes_formated.size(); ++i) {
if (i > 0) { joined += ", "; }
joined += shapes_formated[i];
}
joined += "]";
return joined;
}
} // namespace profiler
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "fmt/core.h"
#include "fmt/format.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/util.h"
using json = nlohmann::json;
namespace oneflow {
namespace profiler {
// Serialize the fields common to all events; subclasses extend this object.
// "input_shapes" defaults to "-" and is overwritten by KernelEvent::ToJson.
nlohmann::json IEvent::ToJson() {
return json{{"name", name_}, {"time", GetDuration<double>()}, {"input_shapes", "-"}};
}
// Raw timestamp setters; values are expressed in this event's time_unit_.
void IEvent::SetStartedAt(double t) { started_at_ = t; }
void IEvent::SetFinishedAt(double t) { finished_at_ = t; }
// Stamp the current wall-clock time as the event's start / finish.
void IEvent::Start() { SetStartedAt(GetTimeNow()); }
void IEvent::Finish() { SetFinishedAt(GetTimeNow()); }
// An event is a child of `e` when its [start, finish] interval is fully
// contained in e's interval. Never a child of null or of itself.
bool IEvent::IsChildOf(const IEvent* e) {
if (!e) { return false; }
if (this == e) { return false; }
return GetStartedAt<double>() >= e->GetStartedAt<double>()
&& GetFinishedAt<double>() <= e->GetFinishedAt<double>();
}
const std::string& IEvent::GetName() const { return name_; }
// A custom event is keyed by its name alone.
std::string CustomEvent::Key() { return name_; }
nlohmann::json CustomEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kCustom;
j["custom_type"] = type_;
return j;
}
// Factory: the constructor is private, so instances are always shared_ptr-owned.
std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, CustomEventType type) {
return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
}
// A kernel event is keyed by its name plus the formatted input shapes, so the
// same op invoked with different shapes is aggregated separately.
std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
// Extend the base JSON with kernel-specific fields.
nlohmann::json KernelEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kOneflowKernel;
j["input_shapes"] = GetFormatedInputShapes();
#if defined(WITH_CUDA) || defined(WITH_ROCM)
// Device memory footprint and nested device events exist only in GPU builds.
j["memory_size"] = memory_size_;
if (!children_.empty()) { j["children"] = children_; }
#endif // WITH_CUDA || WITH_ROCM
return j;
}
// Factory: constructor is private. `shape_getter` may be null, in which case
// no input shapes are captured (see the private constructor).
std::shared_ptr<KernelEvent> KernelEvent::Create(
const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter) {
return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
}
// Format up to `max_num_to_format` recorded input shapes as "[s0, s1, ...]".
// Scalar shapes ("()") are rendered as "scalar"; an ellipsis marks truncation.
// Returns "-" when no shapes were recorded.
std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
if (input_shapes_.empty()) { return "-"; }
std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
// size_t index avoids the signed/unsigned comparison of the previous `auto i = 0`.
for (size_t i = 0; i < shapes_formated.size(); ++i) {
const std::string current_shape = input_shapes_[i].ToString();
shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
}
if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
}
} // namespace profiler
} // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_
#define ONEFLOW_CORE_PROFILER_EVENT_H_
#include <functional>
#include <memory>
#include <vector>
#include "nlohmann/json.hpp"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/shape_view.h"
namespace oneflow {
namespace profiler {
class ProfileManager;
// Top-level discriminator stored in the serialized JSON.
enum class EventType {
kCustom, // has three kinds
kOneflowKernel // OneFlow cpu/cuda kernel
};
// Sub-kind for kCustom events.
enum class CustomEventType {
kDefault, // for record_function
kCudaKernel, // cuda kernel
kCudaRuntime // something like cudaLaunchKernel
};
// Unit in which an event's raw timestamps are stored.
enum class EventTimeUnit { kNS, kUS };
// Abstract base for all profiler events: owns the name, the native time unit,
// and raw start/finish timestamps. Getters convert between units on demand.
class IEvent {
public:
OF_DISALLOW_COPY_AND_MOVE(IEvent);
IEvent() = delete;
IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {}
// Unique-ish key used for aggregation; defined by each subclass.
virtual std::string Key() = 0;
virtual nlohmann::json ToJson();
virtual ~IEvent() = default;
// Record the current time as start / finish.
virtual void Start();
virtual void Finish();
// True when this event's time interval is strictly contained in e's.
bool IsChildOf(const IEvent* e);
const std::string& GetName() const;
// Specialized for double and time_t; `time_unit` selects the output unit.
template<typename T>
const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
template<typename T>
const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
template<typename T>
const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
protected:
virtual void SetStartedAt(double t);
virtual void SetFinishedAt(double t);
std::string name_;
EventTimeUnit time_unit_; // unit of started_at_ / finished_at_
double started_at_ = 0;
double finished_at_ = 0;
};
// Convert `time_` between nanoseconds and microseconds.
// Any other src/dst combination (including identical units) is returned
// unchanged.
inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) {
  const bool ns_to_us = src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS;
  const bool us_to_ns = src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS;
  if (ns_to_us) { return time_ / 1000; }
  if (us_to_ns) { return time_ * 1000; }
  return time_;
}
// Unit-converting accessors: the double specializations do the conversion,
// the time_t ones truncate the double result.
template<>
const inline double IEvent::GetStartedAt<double>(EventTimeUnit time_unit) const {
return ConvertTime(started_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetStartedAt<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetStartedAt<double>(time_unit));
}
template<>
const inline double IEvent::GetFinishedAt<double>(EventTimeUnit time_unit) const {
return ConvertTime(finished_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetFinishedAt<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetFinishedAt<double>(time_unit));
}
template<>
const inline double IEvent::GetDuration<double>(EventTimeUnit time_unit) const {
return GetFinishedAt<double>(time_unit) - GetStartedAt<double>(time_unit);
}
template<>
const inline time_t IEvent::GetDuration<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetDuration<double>(time_unit));
}
// User-defined event (e.g. record_function ranges). kDefault events store
// nanoseconds; CUDA-sourced events store microseconds.
class CustomEvent final : public IEvent {
public:
friend class ProfileManager;
std::string Key() override;
nlohmann::json ToJson() override;
// Only way to construct; constructor is private.
static std::shared_ptr<CustomEvent> Create(const std::string& name,
CustomEventType type = CustomEventType::kDefault);
private:
CustomEventType type_;
CustomEvent(const std::string& custom_name, CustomEventType type)
: IEvent(custom_name,
type == CustomEventType::kDefault ? EventTimeUnit::kNS : EventTimeUnit::kUS),
type_(type) {}
};
// Event for one OneFlow kernel invocation. Optionally captures input shapes;
// in CUDA builds it also tracks a memory footprint and nested device events.
class KernelEvent final : public IEvent {
public:
std::string Key() override;
nlohmann::json ToJson() override;
static std::shared_ptr<KernelEvent> Create(
const std::string& name, const std::function<std::vector<ShapeView>(void)>& shape_getter);
void RecordShape(const ShapeView& shape);
#if defined(WITH_CUDA)
void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
// Adopt `e` as a child only when e's interval lies inside this event's.
bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
if (e->IsChildOf(dynamic_cast<IEvent*>(this))) {
children_.emplace(e);
return true;
}
return false;
}
bool HasChildEvent(const std::shared_ptr<IEvent>& e) { return children_.count(e); }
// Apply `f` to every recorded child event.
void WalkAmongChildren(const std::function<void(const std::shared_ptr<IEvent>& e)>& f) const {
for (const auto& x : children_) { f(x); }
}
#endif // WITH_CUDA
private:
KernelEvent(const std::string& kernel_name,
const std::function<std::vector<ShapeView>(void)>& shape_getter)
: IEvent(kernel_name, EventTimeUnit::kNS) {
if (shape_getter) { input_shapes_ = shape_getter(); }
}
#if defined(WITH_CUDA)
int64_t memory_size_ = -1; // -1 means "not measured"
std::set<std::shared_ptr<IEvent>> children_;
#endif // WITH_CUDA
std::vector<ShapeView> input_shapes_;
std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
};
} // namespace profiler
} // namespace oneflow
namespace nlohmann {
// ADL hook so containers of IEvent shared_ptrs serialize via IEvent::ToJson.
inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) {
j = event->ToJson();
}
} // namespace nlohmann
#endif // ONEFLOW_CORE_PROFILER_EVENT_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_
#define ONEFLOW_CORE_PROFILER_EVENT_H_
#include <functional>
#include <memory>
#include <vector>
#include "nlohmann/json.hpp"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/shape_view.h"
namespace oneflow {
namespace profiler {
class ProfileManager;
// Top-level discriminator stored in the serialized JSON.
enum class EventType {
kCustom, // has three kinds
kOneflowKernel // OneFlow cpu/cuda kernel
};
// Sub-kind for kCustom events.
enum class CustomEventType {
kDefault, // for record_function
kCudaKernel, // cuda kernel
kCudaRuntime // something like cudaLaunchKernel
};
// Unit in which an event's raw timestamps are stored.
enum class EventTimeUnit { kNS, kUS };
// Abstract base for all profiler events: owns the name, the native time unit,
// and raw start/finish timestamps. Getters convert between units on demand.
class IEvent {
public:
OF_DISALLOW_COPY_AND_MOVE(IEvent);
IEvent() = delete;
IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {}
// Unique-ish key used for aggregation; defined by each subclass.
virtual std::string Key() = 0;
virtual nlohmann::json ToJson();
virtual ~IEvent() = default;
// Record the current time as start / finish.
virtual void Start();
virtual void Finish();
// True when this event's time interval is strictly contained in e's.
bool IsChildOf(const IEvent* e);
const std::string& GetName() const;
// Specialized for double and time_t; `time_unit` selects the output unit.
template<typename T>
const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
template<typename T>
const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
template<typename T>
const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
protected:
virtual void SetStartedAt(double t);
virtual void SetFinishedAt(double t);
std::string name_;
EventTimeUnit time_unit_; // unit of started_at_ / finished_at_
double started_at_ = 0;
double finished_at_ = 0;
};
// Convert `time_` between nanoseconds and microseconds; any other combination
// (including identical units) is returned unchanged.
inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) {
if (src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS) {
return time_ / 1000;
}
if (src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS) {
return time_ * 1000;
}
return time_;
}
// Unit-converting accessors: the double specializations do the conversion,
// the time_t ones truncate the double result.
template<>
const inline double IEvent::GetStartedAt<double>(EventTimeUnit time_unit) const {
return ConvertTime(started_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetStartedAt<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetStartedAt<double>(time_unit));
}
template<>
const inline double IEvent::GetFinishedAt<double>(EventTimeUnit time_unit) const {
return ConvertTime(finished_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetFinishedAt<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetFinishedAt<double>(time_unit));
}
template<>
const inline double IEvent::GetDuration<double>(EventTimeUnit time_unit) const {
return GetFinishedAt<double>(time_unit) - GetStartedAt<double>(time_unit);
}
template<>
const inline time_t IEvent::GetDuration<time_t>(EventTimeUnit time_unit) const {
return static_cast<time_t>(GetDuration<double>(time_unit));
}
// User-defined event (e.g. record_function ranges). kDefault events store
// nanoseconds; device-sourced events store microseconds.
class CustomEvent final : public IEvent {
public:
friend class ProfileManager;
std::string Key() override;
nlohmann::json ToJson() override;
// Only way to construct; constructor is private.
static std::shared_ptr<CustomEvent> Create(const std::string& name,
CustomEventType type = CustomEventType::kDefault);
private:
CustomEventType type_;
CustomEvent(const std::string& custom_name, CustomEventType type)
: IEvent(custom_name,
type == CustomEventType::kDefault ? EventTimeUnit::kNS : EventTimeUnit::kUS),
type_(type) {}
};
// Event for one OneFlow kernel invocation. Optionally captures input shapes
// (as owned Shape values here, unlike the ShapeView-based variant); in
// CUDA/ROCm builds it also tracks a memory footprint and nested device events.
class KernelEvent final : public IEvent {
public:
std::string Key() override;
nlohmann::json ToJson() override;
static std::shared_ptr<KernelEvent> Create(
const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter);
#if defined(WITH_CUDA) || defined(WITH_ROCM)
void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
// Adopt `e` as a child only when e's interval lies inside this event's.
bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
if (e->IsChildOf(dynamic_cast<IEvent*>(this))) {
children_.emplace(e);
return true;
}
return false;
}
bool HasChildEvent(const std::shared_ptr<IEvent>& e) { return children_.count(e); }
// Apply `f` to every recorded child event.
void WalkAmongChildren(const std::function<void(const std::shared_ptr<IEvent>& e)>& f) const {
for (const auto& x : children_) { f(x); }
}
#endif // WITH_CUDA || WITH_ROCM
private:
KernelEvent(const std::string& kernel_name,
const std::function<std::vector<Shape>(void)>& shape_getter)
: IEvent(kernel_name, EventTimeUnit::kNS) {
if (shape_getter) { input_shapes_ = shape_getter(); }
}
#if defined(WITH_CUDA) || defined(WITH_ROCM)
int64_t memory_size_ = -1; // -1 means "not measured"
std::set<std::shared_ptr<IEvent>> children_;
#endif // WITH_CUDA || WITH_ROCM
std::vector<Shape> input_shapes_;
std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
};
} // namespace profiler
} // namespace oneflow
namespace nlohmann {
// ADL hook so containers of IEvent shared_ptrs serialize via IEvent::ToJson.
inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) {
j = event->ToJson();
}
} // namespace nlohmann
#endif // ONEFLOW_CORE_PROFILER_EVENT_H_
......@@ -32,13 +32,13 @@ std::shared_ptr<EventRecorder> EventRecorder::CreateCustomEventRecorder(const st
Maybe<EventRecorder> EventRecorder::CreateKernelEventRecorder(
const std::string& name,
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
const std::function<int64_t()>& memory_size_getter,
#endif
const ShapeGetterFuncType& shape_getter) {
auto pmgr = Singleton<ProfileManager>::Get();
if (pmgr) {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
if (pmgr->use_cpu_ || pmgr->use_cuda_) {
auto event = KernelEvent::Create(name, pmgr->record_shapes_ ? shape_getter : nullptr);
if (pmgr->use_cuda_) {
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#include "oneflow/core/common/util.h"
#include "oneflow/core/profiler/event.h"
namespace oneflow {
namespace profiler {
// RAII recorder: registers the event with the ProfileManager and starts it on
// construction, finishes it on destruction.
class EventRecorder {
public:
using ShapeGetterFuncType = std::function<std::vector<ShapeView>(void)>;
OF_DISALLOW_COPY_AND_MOVE(EventRecorder);
explicit EventRecorder(const std::shared_ptr<IEvent>& event) : event_(event) {
CHECK_JUST(RegisterEventToProfileManager(event));
event_->Start();
}
Maybe<void> RegisterEventToProfileManager(const std::shared_ptr<IEvent>& event);
~EventRecorder() {
// Stamp the finish time exactly once, then drop our ownership share.
if (event_) {
event_->Finish();
event_.reset();
}
}
static std::shared_ptr<EventRecorder> CreateCustomEventRecorder(const std::string& name);
// The memory-size getter exists only in CUDA builds.
static Maybe<EventRecorder> CreateKernelEventRecorder(
const std::string& name,
#if defined(WITH_CUDA)
const std::function<int64_t()>& memory_size_getter,
#endif
const ShapeGetterFuncType& shape_getter);
private:
std::shared_ptr<IEvent> event_;
};
} // namespace profiler
} // namespace oneflow
#endif // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#include "oneflow/core/common/util.h"
#include "oneflow/core/profiler/event.h"
namespace oneflow {
namespace profiler {
// RAII recorder: registers the event with the ProfileManager and starts it on
// construction, finishes it on destruction.
class EventRecorder {
public:
using ShapeGetterFuncType = std::function<std::vector<Shape>(void)>;
OF_DISALLOW_COPY_AND_MOVE(EventRecorder);
explicit EventRecorder(const std::shared_ptr<IEvent>& event) : event_(event) {
CHECK_JUST(RegisterEventToProfileManager(event));
event_->Start();
}
Maybe<void> RegisterEventToProfileManager(const std::shared_ptr<IEvent>& event);
~EventRecorder() {
// Stamp the finish time exactly once, then drop our ownership share.
if (event_) {
event_->Finish();
event_.reset();
}
}
static std::shared_ptr<EventRecorder> CreateCustomEventRecorder(const std::string& name);
// The memory-size getter exists only in CUDA/ROCm builds.
static Maybe<EventRecorder> CreateKernelEventRecorder(
const std::string& name,
#if defined(WITH_CUDA) || defined(WITH_ROCM)
const std::function<int64_t()>& memory_size_getter,
#endif
const ShapeGetterFuncType& shape_getter);
private:
std::shared_ptr<IEvent> event_;
};
} // namespace profiler
} // namespace oneflow
#endif // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
......@@ -17,7 +17,11 @@ limitations under the License.
#include "oneflow/core/profiler/kernel.h"
#include "oneflow/core/profiler/profiler.h"
#include "oneflow/core/kernel/kernel.h"
#ifdef WITH_ROCM
#include "oneflow/core/ep/rocm/cuda_stream.h"
#else
#include "oneflow/core/ep/cuda/cuda_stream.h"
#endif
#include "oneflow/core/lazy/actor/actor_context.h"
namespace oneflow {
......@@ -43,6 +47,11 @@ thread_local cudaEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
thread_local cudaEvent_t cuda_memory_bandwidth_profile_end_event = nullptr;
#endif // WITH_CUDA
#if defined(WITH_ROCM)
thread_local hipEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
thread_local hipEvent_t cuda_memory_bandwidth_profile_end_event = nullptr;
#endif // WITH_ROCM
} // namespace
void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel* kernel) {
......@@ -61,6 +70,22 @@ void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel*
}
if (profile_kernel_forward_range) { OF_PROFILER_RANGE_PUSH(kernel->op_conf().name()); }
#endif // WITH_CUDA
#if defined(WITH_ROCM)
if (profile_cuda_memory_bandwidth) {
auto* actor_context_provider = dynamic_cast<ActorContextProvider*>(kernel_ctx);
auto* cuda_stream = dynamic_cast<ep::CudaStream*>(kernel_ctx->stream());
if (cuda_stream != nullptr && actor_context_provider != nullptr) {
CHECK(cuda_memory_bandwidth_profile_start_event == nullptr);
CHECK(cuda_memory_bandwidth_profile_end_event == nullptr);
OF_CUDA_CHECK(hipEventCreate(&cuda_memory_bandwidth_profile_start_event));
OF_CUDA_CHECK(hipEventCreate(&cuda_memory_bandwidth_profile_end_event));
OF_CUDA_CHECK(
hipEventRecord(cuda_memory_bandwidth_profile_start_event, cuda_stream->cuda_stream()));
}
}
if (profile_kernel_forward_range) { OF_PROFILER_RANGE_PUSH(kernel->op_conf().name()); }
#endif // WITH_ROCM
}
void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* kernel) {
......@@ -103,6 +128,45 @@ void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* k
}
}
#endif // WITH_CUDA
#if defined(WITH_ROCM)
if (profile_kernel_forward_range) { OF_PROFILER_RANGE_POP(); }
// The memory bandwidth profiler only works in lazy mode.
if (profile_cuda_memory_bandwidth) {
auto* cuda_stream = dynamic_cast<ep::CudaStream*>(kernel_ctx->stream());
auto* actor_context_provider = dynamic_cast<ActorContextProvider*>(kernel_ctx);
if (cuda_stream != nullptr && actor_context_provider != nullptr) {
hipEvent_t start_event = cuda_memory_bandwidth_profile_start_event;
hipEvent_t end_event = cuda_memory_bandwidth_profile_end_event;
cuda_memory_bandwidth_profile_start_event = nullptr;
cuda_memory_bandwidth_profile_end_event = nullptr;
CHECK_NOTNULL(start_event);
CHECK_NOTNULL(end_event);
OF_CUDA_CHECK(hipEventRecord(end_event, cuda_stream->cuda_stream()));
int64_t memory_size = 0;
for (const auto& bn : kernel->op_attribute().input_bns()) {
const Blob* blob = kernel_ctx->BnInOp2Blob(bn);
if (blob) { memory_size += blob->ByteSizeOfBlobBody(); }
}
for (const auto& bn : kernel->op_attribute().output_bns()) {
const Blob* blob = kernel_ctx->BnInOp2Blob(bn);
if (blob) { memory_size += blob->ByteSizeOfBlobBody(); }
}
const std::string op_name = kernel->op_conf().name();
actor_context_provider->GetActorContext()->AddCallback(
[start_event, end_event, memory_size, op_name]() {
float elapsed_ms = 0;
OF_CUDA_CHECK(hipEventElapsedTime(&elapsed_ms, start_event, end_event));
OF_CUDA_CHECK(hipEventDestroy(start_event));
OF_CUDA_CHECK(hipEventDestroy(end_event));
double bandwidth =
static_cast<double>(memory_size) / (1024.0 * 1024.0 * 1024.0) / (elapsed_ms / 1000);
LOG(INFO) << "PROFILER::KERNEL::CUDA_MEMORY_BANDWIDTH op_name: " << op_name
<< " elapsed(ms): " << elapsed_ms << " memory_size(Byte): " << memory_size
<< " bandwidth(GB/s): " << bandwidth;
});
}
}
#endif // WITH_ROCM
}
} // namespace profiler
......
......@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#include "oneflow/core/profiler/kineto_shim.h"
#include "libkineto.h"
......
......@@ -16,7 +16,7 @@ limitations under the License.
#ifndef ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
#define ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#include <string>
#include <memory>
......
......@@ -15,12 +15,12 @@ limitations under the License.
*/
#include <memory>
#include <unordered_map>
// #include "fmt/core.h"
#include "fmt/core.h"
#include "nlohmann/json.hpp"
#include "oneflow/core/profiler/kineto_shim.h"
#include "oneflow/core/profiler/profile_manager.h"
#include "oneflow/core/profiler/event.h"
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#include <libkineto.h>
#endif // WITH_CUDA
......@@ -48,7 +48,7 @@ std::string ProfileManager::DumpResultsJson() {
}
std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
auto trace = StopTrace();
const auto& kineto_events = *(trace.get()->activities());
std::set<std::shared_ptr<IEvent>> custom_events;
......@@ -77,7 +77,7 @@ std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
while (!events_.empty()) {
auto evt = events_.front();
events_.pop();
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
auto evt_kernel = std::dynamic_pointer_cast<KernelEvent>(evt);
if (evt_kernel) {
std::set<int64_t> current_corr_ids;
......@@ -106,8 +106,7 @@ std::string ProfileManager::GetNextEventRecorderKey(const std::string& name) {
} else {
event_recorders_last_id_[name]++;
}
// return fmt::format("{}.{}", name, event_recorders_last_id_[name]);
return "yuguo";
return fmt::format("{}.{}", name, event_recorders_last_id_[name]);
}
} // namespace profiler
......
......@@ -37,7 +37,7 @@ class ProfileManager {
use_cuda_(use_cuda),
record_shapes_(record_shapes),
record_bandwidth_(record_bandwidth) {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
std::set<ActivityType> activities{};
if (use_cpu) { activities.insert(ActivityType::CPU); }
if (use_cuda) { activities.insert(ActivityType::CUDA); }
......
......@@ -20,11 +20,20 @@ limitations under the License.
#include "oneflow/core/profiler/event_recorder.h"
#include "oneflow/core/vm/vm_util.h"
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_profile.h>
#include <roctracer_roctx.h>
#include <sys/syscall.h>
#include <iostream>
#include "oneflow/core/device/cuda_util.h"
#else
#include <nvtx3/nvToolsExt.h>
#include <sys/syscall.h>
#include <iostream>
#include <cuda_profiler_api.h>
#include "oneflow/core/device/cuda_util.h"
#endif
#endif // OF_ENABLE_PROFILER
namespace oneflow {
......@@ -33,6 +42,16 @@ namespace profiler {
void NameThisHostThread(const std::string& name) {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
static thread_local std::unique_ptr<std::string> thread_name_prefix;
if (!thread_name_prefix) {
thread_name_prefix.reset(
new std::string(GetStringFromEnv("ONEFLOW_PROFILER_HOST_THREAD_NAME_PREFIX", "")));
}
const std::string name_with_prefix = *thread_name_prefix + name;
// nvtxNameOsThreadA(syscall(SYS_gettid), name_with_prefix.c_str());
roctxMarkA(name_with_prefix.c_str());
#else
static thread_local std::unique_ptr<std::string> thread_name_prefix;
if (!thread_name_prefix) {
thread_name_prefix.reset(
......@@ -40,18 +59,27 @@ void NameThisHostThread(const std::string& name) {
}
const std::string name_with_prefix = *thread_name_prefix + name;
nvtxNameOsThreadA(syscall(SYS_gettid), name_with_prefix.c_str());
#endif
#endif // OF_ENABLE_PROFILER
}
void RangePush(const std::string& name) {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
roctxRangePushA(name.c_str());
#else
nvtxRangePushA(name.c_str());
#endif
#endif // OF_ENABLE_PROFILER
}
void RangePop() {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
roctxRangePop();
#else
nvtxRangePop();
#endif
#endif // OF_ENABLE_PROFILER
}
......@@ -82,13 +110,21 @@ void LogHostMemoryUsage(const std::string& name) {
void ProfilerStart() {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
OF_CUDA_CHECK(hipProfilerStart());
#else
OF_CUDA_CHECK(cudaProfilerStart());
#endif
#endif // OF_ENABLE_PROFILER
}
void ProfilerStop() {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
OF_CUDA_CHECK(hipProfilerStop());
#else
OF_CUDA_CHECK(cudaProfilerStop());
#endif
#endif // OF_ENABLE_PROFILER
}
......@@ -105,6 +141,9 @@ Maybe<std::string> DisableProfilerAndReturnResult() {
#if defined(WITH_CUDA)
OF_CUDA_CHECK(cudaDeviceSynchronize());
#endif // WITH_CUDA
#if defined(WITH_ROCM)
OF_CUDA_CHECK(hipDeviceSynchronize());
#endif // WITH_ROCM
auto* pmgr = JUST(SingletonMaybe<ProfileManager>());
std::string results = pmgr->DumpResultsJson();
Singleton<ProfileManager>::Delete();
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#ifdef OF_ENABLE_PROFILER
#include <roctracer_roctx.h>
#endif // OF_ENABLE_PROFILER
namespace oneflow {
namespace {
#ifdef OF_ENABLE_PROFILER
static thread_local HashMap<std::string, roctx_range_id_t> mark2range_id;
#endif
} // namespace
// Per-op state counting how many times the kernel has run; the count makes
// each emitted range name unique ("<prefix>-0", "<prefix>-1", ...).
class NvtxOpKernelState final : public user_op::OpKernelState {
 public:
  NvtxOpKernelState() {
#ifndef OF_ENABLE_PROFILER
    LOG(WARNING) << "To use NVTX, run cmake with -DBUILD_PROFILER=ON";
#endif
  }
  ~NvtxOpKernelState() override = default;

  // Number of completed invocations so far.
  int64_t counter() const { return counter_; }
  // Record one more invocation.
  void IncreaseCount() { counter_ += 1; }

 private:
  int64_t counter_ = 0;
};
// Forwards its input to its output and, when the profiler is compiled in,
// opens a roctx range named "<mark_prefix>-<counter>" whose id is stashed in
// the thread-local mark2range_id map for the matching nvtx_end kernel.
class NvtxStartKernel final : public user_op::OpKernel {
 public:
  NvtxStartKernel() = default;
  ~NvtxStartKernel() override = default;

  std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
      user_op::KernelInitContext* ctx) const override {
    return std::make_shared<NvtxOpKernelState>();
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache*) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    const ShapeView& shape = in->shape_view();
    CHECK_EQ(out->shape_view(), shape);
    const DataType dtype = in->data_type();
    CHECK_EQ(out->data_type(), dtype);
    // Pass the tensor through unconditionally; the range bookkeeping below is
    // compiled only in profiler builds.
    Memcpy<DeviceType::kCUDA>(ctx->stream(), out->mut_dptr<void>(), in->dptr<void>(),
                              shape.elem_cnt() * GetSizeOfDataType(dtype));
#ifdef OF_ENABLE_PROFILER
    auto* counter_state = dynamic_cast<NvtxOpKernelState*>(state);
    const std::string range_name =
        ctx->Attr<std::string>("mark_prefix") + "-" + std::to_string(counter_state->counter());
    const roctx_range_id_t range_id = roctxRangeStartA(range_name.c_str());
    // A duplicate mark would orphan the earlier range id; fail loudly instead.
    CHECK(mark2range_id.emplace(range_name, range_id).second);
    counter_state->IncreaseCount();
#endif  // OF_ENABLE_PROFILER
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers nvtx_start for CUDA-type devices and proposes (but does not
// require, hence the trailing `false`) in-place reuse of the input buffer
// as the output, since the kernel merely forwards in -> out.
REGISTER_USER_KERNEL("nvtx_start")
.SetCreateFn<NvtxStartKernel>()
.SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA)
.SetInplaceProposalFn([](const user_op::InferContext&,
user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> {
OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));
return Maybe<void>::Ok();
});
// Stops the roctx range opened by the matching nvtx_start invocation
// (looked up by its "<mark_prefix>-<counter>" name in the thread-local
// mark2range_id map) and forwards its input to its output.
class NvtxEndKernel final : public user_op::OpKernel {
 public:
  NvtxEndKernel() = default;
  ~NvtxEndKernel() override = default;

  std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
      user_op::KernelInitContext* ctx) const override {
    return std::make_shared<NvtxOpKernelState>();
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache*) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    const ShapeView& in_shape = in->shape_view();
    CHECK_EQ(out->shape_view(), in_shape);
    const DataType in_data_type = in->data_type();
    CHECK_EQ(out->data_type(), in_data_type);
#ifdef OF_ENABLE_PROFILER
    auto* kernel_state = dynamic_cast<NvtxOpKernelState*>(state);
    const std::string mark_prefix = ctx->Attr<std::string>("mark_prefix");
    const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter());
    // Look up with the std::string key directly; find(mark.c_str()) built a
    // needless temporary std::string for every call.
    auto it = mark2range_id.find(mark);
    CHECK(it != mark2range_id.end());
    roctx_range_id_t range_id = it->second;
    mark2range_id.erase(it);
    roctxRangeStop(range_id);
    kernel_state->IncreaseCount();
#endif  // OF_ENABLE_PROFILER
    // BUG FIX: this copy used to live inside the #ifdef above, so a build
    // without OF_ENABLE_PROFILER never wrote `out` (the in-place proposal is
    // optional, so `out` could be garbage). Match NvtxStartKernel and always
    // produce the output; the enabled path still stops the range before the
    // copy is issued, as before.
    Memcpy<DeviceType::kCUDA>(ctx->stream(), out->mut_dptr<void>(), in->dptr<void>(),
                              in_shape.elem_cnt() * GetSizeOfDataType(in_data_type));
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers nvtx_end for CUDA-type devices; like nvtx_start it proposes
// optional (non-required) in-place reuse of the input buffer as the output.
REGISTER_USER_KERNEL("nvtx_end")
.SetCreateFn<NvtxEndKernel>()
.SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA)
.SetInplaceProposalFn([](const user_op::InferContext&,
user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> {
OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));
return Maybe<void>::Ok();
});
} // namespace oneflow
This diff is collapsed.
import numpy as np
import oneflow as flow
def fused_dot_feature_interaction(x,
                                  y,
                                  self_interaction=False,
                                  output_padding=0,
                                  output_concat=None,
                                  dtype=flow.float32
                                  ):
    """Reference implementation of the fused dot feature interaction op.

    Args:
        x: dense feature tensor of shape (bs, es).
        y: embedding tensor of shape (bs, dims, es).
        self_interaction: include the diagonal (i == j) pairs when True.
        output_padding: number of zero columns appended to the result.
        output_concat: optional tensor concatenated before the interaction
            features along dim 1.
        dtype: dtype of the interaction part of the result.

    Returns:
        Tensor of shape (bs, n_pairs [+ output_concat cols] [+ output_padding]).
    """
    (bs, dims, es) = y.shape
    offset = 1 if self_interaction else 0
    # Lower-triangular (i, j) index pairs of the (dims+1) x (dims+1) Gram matrix.
    li = flow.tensor([i for i in range(dims + 1) for j in range(i + offset)])
    lj = flow.tensor([j for i in range(dims + 1) for j in range(i + offset)])
    # Stack the dense row on top of the embeddings: T is (bs, dims+1, es).
    T = flow.cat(
        [
            flow.reshape(x, (bs, 1, es)),
            y,
        ],
        dim=1,
    )
    Z = flow.matmul(T, T, transpose_b=True)
    # gather_nd does not support half, so do the fancy indexing in float32.
    Z = flow.cast(Z, flow.float32)
    Zflat = flow.cast(Z[:, li, lj], dtype)
    if output_concat is not None:
        R = flow.cat([output_concat, Zflat], dim=1)
    else:
        R = Zflat
    if output_padding != 0:
        # BUG FIX: the padding used to be a hard-coded float32 tensor on
        # device="cuda", which broke dtype=float16 (cat dtype mismatch) and
        # any non-CUDA placement. Create it with R's own dtype and device.
        padding_tensor = flow.zeros(bs, output_padding, dtype=R.dtype, device=R.device)
        R = flow.cat([R, padding_tensor], dim=1)
    return R
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment