"vscode:/vscode.git/clone" did not exist on "6880673115182c7e26f96257ff81f5d7985d18ce"
Commit f262efc9 authored by yuguo

Support profiler for DCU, support debug compiler

parent 3f56062c
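
The diff below switches the profiler's CUDA-only code paths to also cover ROCm/HIP so the Kineto-based profiler works on DCU, and adds -mcmodel=large to the ROCm debug build flags. For context, a minimal sketch of how the profiler is driven from Python, modeled on the test hunk at the end of this diff; the oneflow.profiler.profile keyword arguments and the Conv2d setup are assumptions for illustration, not part of this commit:

import oneflow as flow
import oneflow.profiler

# On a DCU/ROCm build of this fork the "cuda" device is assumed to map to HIP.
conv = flow.nn.Conv2d(3, 6, 5).to("cuda")
x = flow.randn(2, 3, 32, 32, device="cuda", requires_grad=True)

# record_shapes feeds the shape_getter path changed in this diff (ShapeView -> Shape);
# the keyword name here is an assumption based on the ProfileManager flags.
with flow.profiler.profile(record_shapes=True) as prof:
    with flow.profiler.record_function("conv_forward_total_time"):
        y = conv(x)
    y.sum().backward()

print(prof.key_averages(group_by_input_shape=True))
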
......@@ -265,9 +265,9 @@ set(ROBIN_HOOD_HASHING_URL
use_mirror(VARIABLE ROBIN_HOOD_HASHING_URL URL ${ROBIN_HOOD_HASHING_URL})
set(ROBIN_HOOD_HASHING_MD5 a78bd30a7582f25984f8592652836467)
set(FMT_URL https://github.com/fmtlib/fmt/archive/48b7e3dafb27ece02cd6addc8bd1041c79d59c2c.zip)
set(FMT_URL https://github.com/fmtlib/fmt/archive/fc07217d85e6dcec52878807d6bbd89a9d9156a5.zip)
use_mirror(VARIABLE FMT_URL URL ${FMT_URL})
set(FMT_MD5 45925a979ed7195e0c88a70be691de09)
set(FMT_MD5 7d9bb2ececc9ede29cd35bdc42a7e22c)
set(KINETO_URL
https://github.com/pytorch/kineto/archive/ff8dba20499a660650632952be76450bd70a52a6.zip)
......
......@@ -175,6 +175,8 @@ if (BUILD_ROCM)
add_definitions(-D__HIP_PLATFORM_HCC__)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ --gpu-max-threads-per-block=1024")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -mcmodel=large")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -mcmodel=large")
list(APPEND oneflow_third_party_libs hip::device)
list(APPEND oneflow_third_party_libs roc::hipblas)
list(APPEND oneflow_third_party_libs hip::hipcub)
......
......@@ -14,8 +14,8 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
// #include "fmt/core.h"
// #include "fmt/format.h"
#include "fmt/core.h"
#include "fmt/format.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/util.h"
......@@ -58,14 +58,13 @@ std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, Custom
return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
}
// std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
std::string KernelEvent::Key() { return "yuguo"; }
std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
nlohmann::json KernelEvent::ToJson() {
auto j = IEvent::ToJson();
j["type"] = EventType::kOneflowKernel;
j["input_shapes"] = GetFormatedInputShapes();
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
j["memory_size"] = memory_size_;
if (!children_.empty()) { j["children"] = children_; }
#endif // WITH_CUDA
......@@ -73,12 +72,10 @@ nlohmann::json KernelEvent::ToJson() {
}
std::shared_ptr<KernelEvent> KernelEvent::Create(
const std::string& name, const std::function<std::vector<ShapeView>(void)>& shape_getter) {
const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter) {
return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
}
void KernelEvent::RecordShape(const ShapeView& shape) { input_shapes_.emplace_back(shape); }
std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
if (input_shapes_.size() == 0) { return "-"; }
std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
......@@ -87,8 +84,7 @@ std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
}
if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
// return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
return "yuguo";
return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
}
} // namespace profiler
......
......@@ -138,11 +138,9 @@ class KernelEvent final : public IEvent {
nlohmann::json ToJson() override;
static std::shared_ptr<KernelEvent> Create(
const std::string& name, const std::function<std::vector<ShapeView>(void)>& shape_getter);
const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter);
void RecordShape(const ShapeView& shape);
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
......@@ -160,17 +158,17 @@ class KernelEvent final : public IEvent {
private:
KernelEvent(const std::string& kernel_name,
const std::function<std::vector<ShapeView>(void)>& shape_getter)
const std::function<std::vector<Shape>(void)>& shape_getter)
: IEvent(kernel_name, EventTimeUnit::kNS) {
if (shape_getter) { input_shapes_ = shape_getter(); }
}
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
int64_t memory_size_ = -1;
std::set<std::shared_ptr<IEvent>> children_;
#endif // WITH_CUDA
std::vector<ShapeView> input_shapes_;
std::vector<Shape> input_shapes_;
std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
};
......
......@@ -32,13 +32,13 @@ std::shared_ptr<EventRecorder> EventRecorder::CreateCustomEventRecorder(const st
Maybe<EventRecorder> EventRecorder::CreateKernelEventRecorder(
const std::string& name,
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
const std::function<int64_t()>& memory_size_getter,
#endif
const ShapeGetterFuncType& shape_getter) {
auto pmgr = Singleton<ProfileManager>::Get();
if (pmgr) {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
if (pmgr->use_cpu_ || pmgr->use_cuda_) {
auto event = KernelEvent::Create(name, pmgr->record_shapes_ ? shape_getter : nullptr);
if (pmgr->use_cuda_) {
......
......@@ -24,7 +24,7 @@ namespace profiler {
class EventRecorder {
public:
using ShapeGetterFuncType = std::function<std::vector<ShapeView>(void)>;
using ShapeGetterFuncType = std::function<std::vector<Shape>(void)>;
OF_DISALLOW_COPY_AND_MOVE(EventRecorder);
......@@ -45,7 +45,7 @@ class EventRecorder {
static Maybe<EventRecorder> CreateKernelEventRecorder(
const std::string& name,
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
const std::function<int64_t()>& memory_size_getter,
#endif
const ShapeGetterFuncType& shape_getter);
......
......@@ -17,7 +17,11 @@ limitations under the License.
#include "oneflow/core/profiler/kernel.h"
#include "oneflow/core/profiler/profiler.h"
#include "oneflow/core/kernel/kernel.h"
#ifdef WITH_ROCM
#include "oneflow/core/ep/rocm/cuda_stream.h"
#else
#include "oneflow/core/ep/cuda/cuda_stream.h"
#endif
#include "oneflow/core/lazy/actor/actor_context.h"
namespace oneflow {
......@@ -43,6 +47,11 @@ thread_local cudaEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
thread_local cudaEvent_t cuda_memory_bandwidth_profile_end_event = nullptr;
#endif // WITH_CUDA
#if defined(WITH_ROCM)
thread_local hipEvent_t cuda_memory_bandwidth_profile_start_event = nullptr;
thread_local hipEvent_t cuda_memory_bandwidth_profile_end_event = nullptr;
#endif // WITH_ROCM
} // namespace
void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel* kernel) {
......@@ -61,6 +70,22 @@ void TraceKernelForwardDataContentStart(KernelContext* kernel_ctx, const Kernel*
}
if (profile_kernel_forward_range) { OF_PROFILER_RANGE_PUSH(kernel->op_conf().name()); }
#endif // WITH_CUDA
#if defined(WITH_ROCM)
if (profile_cuda_memory_bandwidth) {
auto* actor_context_provider = dynamic_cast<ActorContextProvider*>(kernel_ctx);
auto* cuda_stream = dynamic_cast<ep::CudaStream*>(kernel_ctx->stream());
if (cuda_stream != nullptr && actor_context_provider != nullptr) {
CHECK(cuda_memory_bandwidth_profile_start_event == nullptr);
CHECK(cuda_memory_bandwidth_profile_end_event == nullptr);
OF_CUDA_CHECK(hipEventCreate(&cuda_memory_bandwidth_profile_start_event));
OF_CUDA_CHECK(hipEventCreate(&cuda_memory_bandwidth_profile_end_event));
OF_CUDA_CHECK(
hipEventRecord(cuda_memory_bandwidth_profile_start_event, cuda_stream->cuda_stream()));
}
}
if (profile_kernel_forward_range) { OF_PROFILER_RANGE_PUSH(kernel->op_conf().name()); }
#endif // WITH_ROCM
}
void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* kernel) {
......@@ -103,6 +128,45 @@ void TraceKernelForwardDataContentEnd(KernelContext* kernel_ctx, const Kernel* k
}
}
#endif // WITH_CUDA
#if defined(WITH_ROCM)
if (profile_kernel_forward_range) { OF_PROFILER_RANGE_POP(); }
// The memory bandwidth profiler only works in lazy mode.
if (profile_cuda_memory_bandwidth) {
auto* cuda_stream = dynamic_cast<ep::CudaStream*>(kernel_ctx->stream());
auto* actor_context_provider = dynamic_cast<ActorContextProvider*>(kernel_ctx);
if (cuda_stream != nullptr && actor_context_provider != nullptr) {
hipEvent_t start_event = cuda_memory_bandwidth_profile_start_event;
hipEvent_t end_event = cuda_memory_bandwidth_profile_end_event;
cuda_memory_bandwidth_profile_start_event = nullptr;
cuda_memory_bandwidth_profile_end_event = nullptr;
CHECK_NOTNULL(start_event);
CHECK_NOTNULL(end_event);
OF_CUDA_CHECK(hipEventRecord(end_event, cuda_stream->cuda_stream()));
int64_t memory_size = 0;
for (const auto& bn : kernel->op_attribute().input_bns()) {
const Blob* blob = kernel_ctx->BnInOp2Blob(bn);
if (blob) { memory_size += blob->ByteSizeOfBlobBody(); }
}
for (const auto& bn : kernel->op_attribute().output_bns()) {
const Blob* blob = kernel_ctx->BnInOp2Blob(bn);
if (blob) { memory_size += blob->ByteSizeOfBlobBody(); }
}
const std::string op_name = kernel->op_conf().name();
actor_context_provider->GetActorContext()->AddCallback(
[start_event, end_event, memory_size, op_name]() {
float elapsed_ms = 0;
OF_CUDA_CHECK(hipEventElapsedTime(&elapsed_ms, start_event, end_event));
OF_CUDA_CHECK(hipEventDestroy(start_event));
OF_CUDA_CHECK(hipEventDestroy(end_event));
double bandwidth =
static_cast<double>(memory_size) / (1024.0 * 1024.0 * 1024.0) / (elapsed_ms / 1000);
LOG(INFO) << "PROFILER::KERNEL::CUDA_MEMORY_BANDWIDTH op_name: " << op_name
<< " elapsed(ms): " << elapsed_ms << " memory_size(Byte): " << memory_size
<< " bandwidth(GB/s): " << bandwidth;
});
}
}
#endif // WITH_ROCM
}
} // namespace profiler
......
......@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#include "oneflow/core/profiler/kineto_shim.h"
#include "libkineto.h"
......
......@@ -16,7 +16,7 @@ limitations under the License.
#ifndef ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
#define ONEFLOW_CORE_PROFILER_KINETO_SHIM_H_
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#include <string>
#include <memory>
......
......@@ -15,12 +15,12 @@ limitations under the License.
*/
#include <memory>
#include <unordered_map>
// #include "fmt/core.h"
#include "fmt/core.h"
#include "nlohmann/json.hpp"
#include "oneflow/core/profiler/kineto_shim.h"
#include "oneflow/core/profiler/profile_manager.h"
#include "oneflow/core/profiler/event.h"
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
#include <libkineto.h>
#endif // WITH_CUDA
......@@ -48,7 +48,7 @@ std::string ProfileManager::DumpResultsJson() {
}
std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
auto trace = StopTrace();
const auto& kineto_events = *(trace.get()->activities());
std::set<std::shared_ptr<IEvent>> custom_events;
......@@ -77,7 +77,7 @@ std::vector<std::shared_ptr<IEvent>> ProfileManager::ExportEvents() {
while (!events_.empty()) {
auto evt = events_.front();
events_.pop();
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
auto evt_kernel = std::dynamic_pointer_cast<KernelEvent>(evt);
if (evt_kernel) {
std::set<int64_t> current_corr_ids;
......@@ -106,8 +106,7 @@ std::string ProfileManager::GetNextEventRecorderKey(const std::string& name) {
} else {
event_recorders_last_id_[name]++;
}
// return fmt::format("{}.{}", name, event_recorders_last_id_[name]);
return "yuguo";
return fmt::format("{}.{}", name, event_recorders_last_id_[name]);
}
} // namespace profiler
......
......@@ -37,7 +37,7 @@ class ProfileManager {
use_cuda_(use_cuda),
record_shapes_(record_shapes),
record_bandwidth_(record_bandwidth) {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
std::set<ActivityType> activities{};
if (use_cpu) { activities.insert(ActivityType::CPU); }
if (use_cuda) { activities.insert(ActivityType::CUDA); }
......
......@@ -20,11 +20,20 @@ limitations under the License.
#include "oneflow/core/profiler/event_recorder.h"
#include "oneflow/core/vm/vm_util.h"
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_profile.h>
#include <roctracer_roctx.h>
#include <sys/syscall.h>
#include <iostream>
#include "oneflow/core/device/cuda_util.h"
#else
#include <nvtx3/nvToolsExt.h>
#include <sys/syscall.h>
#include <iostream>
#include <cuda_profiler_api.h>
#include "oneflow/core/device/cuda_util.h"
#endif
#endif // OF_ENABLE_PROFILER
namespace oneflow {
......@@ -33,6 +42,16 @@ namespace profiler {
void NameThisHostThread(const std::string& name) {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
static thread_local std::unique_ptr<std::string> thread_name_prefix;
if (!thread_name_prefix) {
thread_name_prefix.reset(
new std::string(GetStringFromEnv("ONEFLOW_PROFILER_HOST_THREAD_NAME_PREFIX", "")));
}
const std::string name_with_prefix = *thread_name_prefix + name;
// nvtxNameOsThreadA(syscall(SYS_gettid), name_with_prefix.c_str());
roctxMarkA(name_with_prefix.c_str());
#else
static thread_local std::unique_ptr<std::string> thread_name_prefix;
if (!thread_name_prefix) {
thread_name_prefix.reset(
......@@ -40,18 +59,27 @@ void NameThisHostThread(const std::string& name) {
}
const std::string name_with_prefix = *thread_name_prefix + name;
nvtxNameOsThreadA(syscall(SYS_gettid), name_with_prefix.c_str());
#endif
#endif // OF_ENABLE_PROFILER
}
void RangePush(const std::string& name) {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
roctxRangePushA(name.c_str());
#else
nvtxRangePushA(name.c_str());
#endif
#endif // OF_ENABLE_PROFILER
}
void RangePop() {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
roctxRangePop();
#else
nvtxRangePop();
#endif
#endif // OF_ENABLE_PROFILER
}
......@@ -82,13 +110,21 @@ void LogHostMemoryUsage(const std::string& name) {
void ProfilerStart() {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
OF_CUDA_CHECK(hipProfilerStart());
#else
OF_CUDA_CHECK(cudaProfilerStart());
#endif
#endif // OF_ENABLE_PROFILER
}
void ProfilerStop() {
#ifdef OF_ENABLE_PROFILER
#ifdef WITH_ROCM
OF_CUDA_CHECK(hipProfilerStop());
#else
OF_CUDA_CHECK(cudaProfilerStop());
#endif
#endif // OF_ENABLE_PROFILER
}
......@@ -105,6 +141,9 @@ Maybe<std::string> DisableProfilerAndReturnResult() {
#if defined(WITH_CUDA)
OF_CUDA_CHECK(cudaDeviceSynchronize());
#endif // WITH_CUDA
#if defined(WITH_ROCM)
OF_CUDA_CHECK(hipDeviceSynchronize());
#endif // WITH_ROCM
auto* pmgr = JUST(SingletonMaybe<ProfileManager>());
std::string results = pmgr->DumpResultsJson();
Singleton<ProfileManager>::Delete();
......
......@@ -250,7 +250,7 @@ struct LgammaFunctor<float> {
static OF_DEVICE_FUNC float Backward(const float x, const float dy) {
// TODO(chengcheng): return: dy * digamma(x)
assert(false);
// assert(false);
return 0.0f;
}
};
......@@ -526,7 +526,7 @@ struct LgammaFunctor<double> {
static OF_DEVICE_FUNC double Backward(const double x, const double dy) {
// TODO(chengcheng): return: dy * digamma(x)
assert(false);
// assert(false);
return 0.0;
}
};
......@@ -817,7 +817,7 @@ struct LgammaFunctor<half> {
static OF_HALF_FUNC half Backward(const half x, const half dy) {
// TODO(chengcheng): return: dy * digamma(x)
assert(false);
// assert(false);
return GetZeroVal<half>();
}
};
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#ifdef OF_ENABLE_PROFILER
#include <roctracer_roctx.h>
#endif // OF_ENABLE_PROFILER
namespace oneflow {
namespace {
#ifdef OF_ENABLE_PROFILER
static thread_local HashMap<std::string, roctx_range_id_t> mark2range_id;
#endif
} // namespace
class NvtxOpKernelState final : public user_op::OpKernelState {
public:
NvtxOpKernelState() : counter_(0) {
#ifndef OF_ENABLE_PROFILER
LOG(WARNING) << "To use NVTX, run cmake with -DBUILD_PROFILER=ON";
#endif
}
~NvtxOpKernelState() override = default;
int64_t counter() const { return counter_; }
void IncreaseCount() { counter_ += 1; }
private:
int64_t counter_;
};
class NvtxStartKernel final : public user_op::OpKernel {
public:
NvtxStartKernel() = default;
~NvtxStartKernel() override = default;
std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
user_op::KernelInitContext* ctx) const override {
return std::make_shared<NvtxOpKernelState>();
}
private:
using user_op::OpKernel::Compute;
void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
const user_op::OpKernelCache*) const override {
const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
const ShapeView& in_shape = in->shape_view();
CHECK_EQ(out->shape_view(), in_shape);
const DataType in_data_type = in->data_type();
CHECK_EQ(out->data_type(), in_data_type);
Memcpy<DeviceType::kCUDA>(ctx->stream(), out->mut_dptr<void>(), in->dptr<void>(),
in_shape.elem_cnt() * GetSizeOfDataType(in_data_type));
#ifdef OF_ENABLE_PROFILER
auto* kernel_state = dynamic_cast<NvtxOpKernelState*>(state);
const std::string mark_prefix = ctx->Attr<std::string>("mark_prefix");
const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter());
roctx_range_id_t range_id = roctxRangeStartA(mark.c_str());
CHECK(mark2range_id.emplace(mark, range_id).second);
kernel_state->IncreaseCount();
#endif // OF_ENABLE_PROFILER
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
REGISTER_USER_KERNEL("nvtx_start")
.SetCreateFn<NvtxStartKernel>()
.SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA)
.SetInplaceProposalFn([](const user_op::InferContext&,
user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> {
OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));
return Maybe<void>::Ok();
});
class NvtxEndKernel final : public user_op::OpKernel {
public:
NvtxEndKernel() = default;
~NvtxEndKernel() override = default;
std::shared_ptr<user_op::OpKernelState> CreateOpKernelState(
user_op::KernelInitContext* ctx) const override {
return std::make_shared<NvtxOpKernelState>();
}
private:
using user_op::OpKernel::Compute;
void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
const user_op::OpKernelCache*) const override {
const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
const ShapeView& in_shape = in->shape_view();
CHECK_EQ(out->shape_view(), in_shape);
const DataType in_data_type = in->data_type();
CHECK_EQ(out->data_type(), in_data_type);
#ifdef OF_ENABLE_PROFILER
auto* kernel_state = dynamic_cast<NvtxOpKernelState*>(state);
const std::string mark_prefix = ctx->Attr<std::string>("mark_prefix");
const std::string mark = mark_prefix + "-" + std::to_string(kernel_state->counter());
auto it = mark2range_id.find(mark.c_str());
CHECK(it != mark2range_id.end());
roctx_range_id_t range_id = it->second;
mark2range_id.erase(it);
roctxRangeStop(range_id);
Memcpy<DeviceType::kCUDA>(ctx->stream(), out->mut_dptr<void>(), in->dptr<void>(),
in_shape.elem_cnt() * GetSizeOfDataType(in_data_type));
kernel_state->IncreaseCount();
#endif
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
REGISTER_USER_KERNEL("nvtx_end")
.SetCreateFn<NvtxEndKernel>()
.SetIsMatchedHob(user_op::HobDeviceType() == DeviceType::kCUDA)
.SetInplaceProposalFn([](const user_op::InferContext&,
user_op::AddInplaceArgPair AddInplaceArgPairFn) -> Maybe<void> {
OF_RETURN_IF_ERROR(AddInplaceArgPairFn("out", 0, "in", 0, false));
return Maybe<void>::Ok();
});
} // namespace oneflow
......@@ -867,7 +867,7 @@ void StatefulOpKernel::Compute(eager::CallContext* call_ctx, DeviceCtx* device_c
auto* compute_ctx = &compute_context;
OF_PROFILER_RANGE_GUARD("Compute");
if (Singleton<profiler::ProfileManager>::Get()) {
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
const auto CalMemorySize = [compute_ctx](const one::ArgVec& args) -> int64_t {
const auto Func = [compute_ctx](int64_t mem_size, const auto& pair) {
const auto tensor = compute_ctx->Tensor4ArgNameAndIndex(pair.first, pair.second);
......@@ -878,13 +878,13 @@ void StatefulOpKernel::Compute(eager::CallContext* call_ctx, DeviceCtx* device_c
#endif
auto er_guard = CHECK_JUST(profiler::EventRecorder::CreateKernelEventRecorder(
op_type_name(),
#if defined(WITH_CUDA)
#if defined(WITH_CUDA) || defined(WITH_ROCM)
[compute_ctx, CalMemorySize]() -> int64_t {
return CalMemorySize(compute_ctx->inputs()) + CalMemorySize(compute_ctx->outputs());
},
#endif
[compute_ctx]() -> std::vector<ShapeView> {
std::vector<ShapeView> shapes;
[compute_ctx]() -> std::vector<Shape> {
std::vector<Shape> shapes;
for (const auto& pair : compute_ctx->inputs()) {
shapes.emplace_back(
compute_ctx->TensorDesc4ArgNameAndIndex(pair.first, pair.second)->shape());
......
import numpy as np
import oneflow as flow
def fused_dot_feature_interaction(x,
                                  y,
                                  self_interaction=False,
                                  output_padding=0,
                                  output_concat=None,
                                  dtype=flow.float32
                                  ):
    # (bs, es) = x.shape
    (bs, dims, es) = y.shape
    if self_interaction:
        offset = 1
    else:
        offset = 0
    li = flow.tensor([i for i in range(dims + 1) for j in range(i + offset)])
    lj = flow.tensor([j for i in range(dims + 1) for j in range(i + offset)])
    T = flow.cat(
        [
            flow.reshape(x, (bs, 1, es)),
            y,
        ],
        dim=1,
    )
    Z = flow.matmul(T, T, transpose_b=True)
    # gather_nd does not support half, so cast to float32
    Z = flow.cast(Z, flow.float32)
    Zflat = Z[:, li, lj]
    Zflat = flow.cast(Zflat, dtype)
    if output_concat is not None:
        R = flow.cat([output_concat, Zflat], dim=1)
    else:
        R = Zflat
    if output_padding != 0:
        padding_tensor = flow.tensor(
            np.zeros((bs, output_padding)).astype(np.float32),
            device="cuda",
            requires_grad=False,
        )
        R = flow.cat([R, padding_tensor], dim=1)
    return R
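
A small usage sketch for the helper above; the shapes are arbitrary and chosen only for illustration (CPU tensors and the default output_padding=0, so the hard-coded cuda padding tensor is never created):

bs, dims, es = 4, 26, 16
x = flow.randn(bs, es)
y = flow.randn(bs, dims, es)
# With self_interaction=False, the result is the flattened strict lower triangle of the
# (dims + 1) x (dims + 1) dot-product matrix: (dims + 1) * dims / 2 = 351 columns.
out = fused_dot_feature_interaction(x, y)
print(out.shape)  # expected: (4, 351)
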
......@@ -80,7 +80,7 @@ def _test_lenet(
with oneflow.profiler.record_function("lenet_backward_total_time") as f:
    eager_res.sum().backward()
events = prof.key_averages(group_by_input_shape=True)
print(events)
conv_event = get_event(
    events, "conv2d", "[(2,3,32,32), (6,3,5,5)]" if record_shapes else "-"
)
......