Commit 8f7de847 authored by yuguo960516yuguo

dtk

parent f262efc9
Pipeline #248 failed with stages in 0 seconds
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/ndarray/ndarray_assign_core.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/kernel/kernel_util.h"

namespace oneflow {

namespace {

template<typename T, typename X, int NDIMS>
__global__ void NdarrayAssignReducedGpu(XpuVarNdarray<T> y,
                                        const XpuReducedNdarray<X, NDIMS> reduced) {
  NdarrayAssignCore<T, X, NDIMS>::Assign(y, reduced);
}

template<typename T, typename X, int NDIMS>
__global__ void NdarrayAssignGpu(XpuVarNdarray<T> y, const XpuVarNdarray<const X> x) {
  NdarrayAssignCore<T, X, NDIMS>::Assign(y, x);
}

}  // namespace

template<typename T, typename X, int NDIMS>
struct NdarrayAssignCoreWrapper<DeviceType::kCUDA, T, X, NDIMS> final {
  static void Assign(ep::Stream* stream, XpuVarNdarray<T>* y,
                     const XpuReducedNdarray<X, NDIMS>& reduced) {
    size_t n = y->host_shape().HostElemNum();
    RUN_CUDA_KERNEL((NdarrayAssignReducedGpu<T, X, NDIMS>), stream, n, *y, reduced);
  }
  static void Assign(ep::Stream* ctx, const XpuVarNdarray<T>& y, const XpuVarNdarray<const X>& x) {
    size_t n = y.host_shape().HostElemNum();
    if (n == 0) { return; }
    RUN_CUDA_KERNEL((NdarrayAssignGpu<T, X, NDIMS>), ctx, n, y, x);
  }
};

#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS)                           \
  template struct NdarrayAssignCoreWrapper<DeviceType::kCUDA, OF_PP_PAIR_FIRST(ret_dtype_pair), \
                                           OF_PP_PAIR_FIRST(dtype_pair), NDIMS>;
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(
    INSTANTIATE_NDARRAY_ASSIGN,
    ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ,
    ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ BOOL_DATA_TYPE_SEQ, DIM_SEQ);
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ, HALF_DATA_TYPE_SEQ,
                                 DIM_SEQ);

}  // namespace oneflow
\ No newline at end of file
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "fmt/core.h"
#include "fmt/format.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/util.h"

using json = nlohmann::json;

namespace oneflow {
namespace profiler {

nlohmann::json IEvent::ToJson() {
  return json{{"name", name_}, {"time", GetDuration<double>()}, {"input_shapes", "-"}};
}

void IEvent::SetStartedAt(double t) { started_at_ = t; }

void IEvent::SetFinishedAt(double t) { finished_at_ = t; }

void IEvent::Start() { SetStartedAt(GetTimeNow()); }

void IEvent::Finish() { SetFinishedAt(GetTimeNow()); }

bool IEvent::IsChildOf(const IEvent* e) {
  if (!e) { return false; }
  if (this == e) { return false; }
  return GetStartedAt<double>() >= e->GetStartedAt<double>()
         && GetFinishedAt<double>() <= e->GetFinishedAt<double>();
}

const std::string& IEvent::GetName() const { return name_; }

std::string CustomEvent::Key() { return name_; }

nlohmann::json CustomEvent::ToJson() {
  auto j = IEvent::ToJson();
  j["type"] = EventType::kCustom;
  j["custom_type"] = type_;
  return j;
}

std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, CustomEventType type) {
  return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
}

std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }

nlohmann::json KernelEvent::ToJson() {
  auto j = IEvent::ToJson();
  j["type"] = EventType::kOneflowKernel;
  j["input_shapes"] = GetFormatedInputShapes();
#if defined(WITH_CUDA) || defined(WITH_ROCM)
  j["memory_size"] = memory_size_;
  if (!children_.empty()) { j["children"] = children_; }
#endif  // WITH_CUDA || WITH_ROCM
  return j;
}

std::shared_ptr<KernelEvent> KernelEvent::Create(
    const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter) {
  return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
}

std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
  if (input_shapes_.size() == 0) { return "-"; }
  std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
  for (size_t i = 0; i < shapes_formated.size(); ++i) {
    const std::string current_shape = input_shapes_[i].ToString();
    shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
  }
  if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
  return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
}

}  // namespace profiler
}  // namespace oneflow
\ No newline at end of file
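Not part of the commit, for orientation only: a minimal sketch of how the event API implemented above can be driven end to end, assuming it is compiled inside the OneFlow tree. The helper name SketchProfilerEventUsage is hypothetical; CustomEvent::Create, Start(), Finish(), GetDuration() and ToJson() are the members shown in this diff.

// Illustrative sketch only, not part of the commit.
#include <iostream>
#include "oneflow/core/profiler/event.h"

void SketchProfilerEventUsage() {  // hypothetical helper name
  using namespace oneflow::profiler;
  // kDefault custom events record their timestamps in nanoseconds.
  auto event = CustomEvent::Create("my_region", CustomEventType::kDefault);
  event->Start();
  // ... the region being measured ...
  event->Finish();
  // GetDuration() defaults to microseconds regardless of the event's native unit.
  std::cout << event->GetName() << " took " << event->GetDuration<double>() << " us\n"
            << event->ToJson().dump() << std::endl;
}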
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_
#define ONEFLOW_CORE_PROFILER_EVENT_H_

#include <functional>
#include <memory>
#include <set>
#include <vector>
#include "nlohmann/json.hpp"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/shape_view.h"

namespace oneflow {
namespace profiler {

class ProfileManager;

enum class EventType {
  kCustom,        // has three kinds
  kOneflowKernel  // OneFlow cpu/cuda kernel
};

enum class CustomEventType {
  kDefault,     // for record_function
  kCudaKernel,  // cuda kernel
  kCudaRuntime  // something like cudaLaunchKernel
};

enum class EventTimeUnit { kNS, kUS };

class IEvent {
 public:
  OF_DISALLOW_COPY_AND_MOVE(IEvent);
  IEvent() = delete;
  IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {}
  virtual std::string Key() = 0;
  virtual nlohmann::json ToJson();
  virtual ~IEvent() = default;
  virtual void Start();
  virtual void Finish();

  bool IsChildOf(const IEvent* e);

  const std::string& GetName() const;

  template<typename T>
  const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const;

  template<typename T>
  const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;

  template<typename T>
  const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;

 protected:
  virtual void SetStartedAt(double t);
  virtual void SetFinishedAt(double t);

  std::string name_;
  EventTimeUnit time_unit_;
  double started_at_ = 0;
  double finished_at_ = 0;
};

inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) {
  if (src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS) {
    return time_ / 1000;
  }
  if (src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS) {
    return time_ * 1000;
  }
  return time_;
}

template<>
const inline double IEvent::GetStartedAt<double>(EventTimeUnit time_unit) const {
  return ConvertTime(started_at_, time_unit_, time_unit);
}

template<>
const inline time_t IEvent::GetStartedAt<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetStartedAt<double>(time_unit));
}

template<>
const inline double IEvent::GetFinishedAt<double>(EventTimeUnit time_unit) const {
  return ConvertTime(finished_at_, time_unit_, time_unit);
}

template<>
const inline time_t IEvent::GetFinishedAt<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetFinishedAt<double>(time_unit));
}

template<>
const inline double IEvent::GetDuration<double>(EventTimeUnit time_unit) const {
  return GetFinishedAt<double>(time_unit) - GetStartedAt<double>(time_unit);
}

template<>
const inline time_t IEvent::GetDuration<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetDuration<double>(time_unit));
}

class CustomEvent final : public IEvent {
 public:
  friend class ProfileManager;
  std::string Key() override;
  nlohmann::json ToJson() override;
  static std::shared_ptr<CustomEvent> Create(const std::string& name,
                                             CustomEventType type = CustomEventType::kDefault);

 private:
  CustomEventType type_;
  CustomEvent(const std::string& custom_name, CustomEventType type)
      : IEvent(custom_name,
               type == CustomEventType::kDefault ? EventTimeUnit::kNS : EventTimeUnit::kUS),
        type_(type) {}
};

class KernelEvent final : public IEvent {
 public:
  std::string Key() override;
  nlohmann::json ToJson() override;
  static std::shared_ptr<KernelEvent> Create(
      const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter);

#if defined(WITH_CUDA) || defined(WITH_ROCM)
  void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
  void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
  bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
    if (e->IsChildOf(dynamic_cast<IEvent*>(this))) {
      children_.emplace(e);
      return true;
    }
    return false;
  }
  bool HasChildEvent(const std::shared_ptr<IEvent>& e) { return children_.count(e); }
  void WalkAmongChildren(const std::function<void(const std::shared_ptr<IEvent>& e)>& f) const {
    for (const auto& x : children_) { f(x); }
  }
#endif  // WITH_CUDA || WITH_ROCM

 private:
  KernelEvent(const std::string& kernel_name,
              const std::function<std::vector<Shape>(void)>& shape_getter)
      : IEvent(kernel_name, EventTimeUnit::kNS) {
    if (shape_getter) { input_shapes_ = shape_getter(); }
  }

#if defined(WITH_CUDA) || defined(WITH_ROCM)
  int64_t memory_size_ = -1;
  std::set<std::shared_ptr<IEvent>> children_;
#endif  // WITH_CUDA || WITH_ROCM

  std::vector<Shape> input_shapes_;
  std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
};

}  // namespace profiler
}  // namespace oneflow

namespace nlohmann {
inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) {
  j = event->ToJson();
}
}  // namespace nlohmann

#endif  // ONEFLOW_CORE_PROFILER_EVENT_H_
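Not part of the commit: a small check of the unit conversion defined by ConvertTime above, which scales by 1000 between nanoseconds and microseconds and passes every other combination through unchanged. SketchConvertTime is a hypothetical helper name; the values are chosen to be exactly representable so the equality checks are sound.

// Illustrative sketch only, not part of the commit.
#include <cassert>
#include "oneflow/core/profiler/event.h"

void SketchConvertTime() {  // hypothetical helper name
  using oneflow::profiler::ConvertTime;
  using oneflow::profiler::EventTimeUnit;
  assert(ConvertTime(2500.0, EventTimeUnit::kNS, EventTimeUnit::kUS) == 2.5);  // ns -> us
  assert(ConvertTime(2.5, EventTimeUnit::kUS, EventTimeUnit::kNS) == 2500.0);  // us -> ns
  assert(ConvertTime(7.0, EventTimeUnit::kUS, EventTimeUnit::kUS) == 7.0);     // pass-through
}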
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_

#include "oneflow/core/common/util.h"
#include "oneflow/core/profiler/event.h"

namespace oneflow {
namespace profiler {

class EventRecorder {
 public:
  using ShapeGetterFuncType = std::function<std::vector<Shape>(void)>;

  OF_DISALLOW_COPY_AND_MOVE(EventRecorder);

  explicit EventRecorder(const std::shared_ptr<IEvent>& event) : event_(event) {
    CHECK_JUST(RegisterEventToProfileManager(event));
    event_->Start();
  }

  Maybe<void> RegisterEventToProfileManager(const std::shared_ptr<IEvent>& event);

  ~EventRecorder() {
    if (event_) {
      event_->Finish();
      event_.reset();
    }
  }

  static std::shared_ptr<EventRecorder> CreateCustomEventRecorder(const std::string& name);

  static Maybe<EventRecorder> CreateKernelEventRecorder(
      const std::string& name,
#if defined(WITH_CUDA) || defined(WITH_ROCM)
      const std::function<int64_t()>& memory_size_getter,
#endif
      const ShapeGetterFuncType& shape_getter);

 private:
  std::shared_ptr<IEvent> event_;
};

}  // namespace profiler
}  // namespace oneflow

#endif  // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
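Not part of the commit: the RAII usage the EventRecorder above is designed for, sketched under two assumptions that are not confirmed by this diff: the header lives at oneflow/core/profiler/event_recorder.h (inferred from its include guard), and a profiling session is active so registering the event succeeds. SketchEventRecorderScope is a hypothetical name.

// Illustrative sketch only, not part of the commit; path and active profiler assumed.
#include "oneflow/core/profiler/event_recorder.h"

void SketchEventRecorderScope() {  // hypothetical helper name
  // The constructor registers the event with the ProfileManager and calls Start();
  // the destructor calls Finish() and releases the event.
  auto recorder = oneflow::profiler::EventRecorder::CreateCustomEventRecorder("my_scope");
  // ... the code being measured ...
}  // `recorder` is destroyed here, stamping the finish time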
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
#define ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_

#include "oneflow/core/common/thread_local_guard.h"

namespace oneflow {

enum class SyncVmMode {
  kInvalid = 0,
  kEnable = 1,
  kDisable = 2,
};

class SyncVmModeGuard final : public ThreadLocalGuard<SyncVmMode> {
 public:
  using ThreadLocalGuard<SyncVmMode>::ThreadLocalGuard;
  ~SyncVmModeGuard() = default;

  static bool IsCurrentSyncVmMode() {
    const auto& opt_sync_mode = Current();
    return opt_sync_mode.has_value() && CHECK_JUST(opt_sync_mode) == SyncVmMode::kEnable;
  }
};

}  // namespace oneflow

#endif  // ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
\ No newline at end of file
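Not part of the commit: a sketch of how the thread-local guard above is typically consulted. Two assumptions are made that this diff does not confirm: the header path (inferred from the include guard) and that ThreadLocalGuard<SyncVmMode> exposes a constructor taking the value to install for the current scope. SketchSyncVmModeGuard is a hypothetical name.

// Illustrative sketch only, not part of the commit; path and constructor assumed.
#include "oneflow/core/vm/sync_vm_mode_guard.h"

void SketchSyncVmModeGuard() {  // hypothetical helper name
  using oneflow::SyncVmMode;
  using oneflow::SyncVmModeGuard;
  // With no guard installed on this thread, Current() holds no value, so the query is false.
  bool outside = SyncVmModeGuard::IsCurrentSyncVmMode();
  {
    SyncVmModeGuard guard(SyncVmMode::kEnable);                // assumed value constructor
    bool inside = SyncVmModeGuard::IsCurrentSyncVmMode();      // true within this scope
    (void)inside;
  }
  (void)outside;
}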
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/common/data_type.h"
#include "oneflow/core/kernel/util/cuda_half_util.h"
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/operator/operator_util.h"
#include "oneflow/user/utils/pool_util.h"
#include <algorithm>
#include <cfloat>
#include <cmath>

namespace oneflow {
namespace user_op {

#define START_IND(a, b, c) (int)std::floor((float)(a * c) / b)
#define END_IND(a, b, c) (int)std::ceil((float)((a + 1) * c) / b)
#define START_IND_INT(a, b, c) ((a * c) / b)
#define END_IND_INT(a, b, c) (((a + 1) * c + b - 1) / b)

template<typename T>
__global__ void InitPtr(int elements, T* ptr) {
  int gid = (blockDim.x * blockIdx.x) + threadIdx.x;
  int step = gridDim.x * blockDim.x;
  while (gid < elements) {
    ptr[gid] = static_cast<T>(0);
    gid += step;
  }
}

inline Shape GetShape5D(const Shape& shape, const std::string& data_format, int32_t dim) {
  FixedDimVector shape_3d = {GetInDim(shape, data_format, 0, dim),
                             GetInDim(shape, data_format, 1, dim),
                             GetInDim(shape, data_format, 2, dim)};
  return Shape({shape.At(0), shape.At(1), shape_3d.at(0), shape_3d.at(1), shape_3d.at(2)});
}

template<typename T>
__global__ void AdaptiveAvgPoolCudaKernel(const T* input, T* output, int num_elems, int in_d,
                                          int in_h, int in_w, int out_d, int out_h, int out_w) {
  const int out_panel_size = out_d * out_h * out_w;
  const int in_panel_size = in_d * in_h * in_w;
  CUDA_1D_KERNEL_LOOP(idx, num_elems) {
    // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper'
    int bc_idx = idx / out_panel_size;
    int out_d_idx = (idx % out_panel_size) / out_w / out_h;
    int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w;
    int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w;
    int in_start_d = START_IND(out_d_idx, out_d, in_d);
    int in_end_d = END_IND(out_d_idx, out_d, in_d);
    int k_d = in_end_d - in_start_d;
    int in_start_h = START_IND(out_h_idx, out_h, in_h);
    int in_end_h = END_IND(out_h_idx, out_h, in_h);
    int k_h = in_end_h - in_start_h;
    int in_start_w = START_IND(out_w_idx, out_w, in_w);
    int in_end_w = END_IND(out_w_idx, out_w, in_w);
    int k_w = in_end_w - in_start_w;
    const T* in_ptr =
        input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w;
    T sum = static_cast<T>(0);
    for (int id = 0; id < k_d; ++id) {
      for (int ih = 0; ih < k_h; ++ih) {
        for (int iw = 0; iw < k_w; ++iw) {
          T val = *(in_ptr + ih * in_w + iw);
          sum += val;
        }
      }
      in_ptr += in_h * in_w;  // next input depth
    }
    // Update output
    output[idx] = sum / k_d / k_h / k_w;
  }
}

template<typename T>
__global__ void AdaptiveAvgPoolGradCudaKernel(T* input, const T* output, int num_elems, int in_d,
                                              int in_h, int in_w, int out_d, int out_h,
                                              int out_w) {
  const int out_panel_size = out_d * out_h * out_w;
  const int in_panel_size = in_d * in_h * in_w;
  CUDA_1D_KERNEL_LOOP(idx, num_elems) {
    // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper'
    int bc_idx = idx / out_panel_size;
    int out_d_idx = (idx % out_panel_size) / out_w / out_h;
    int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w;
    int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w;
    int in_start_d = START_IND(out_d_idx, out_d, in_d);
    int in_end_d = END_IND(out_d_idx, out_d, in_d);
    int k_d = in_end_d - in_start_d;
    int in_start_h = START_IND(out_h_idx, out_h, in_h);
    int in_end_h = END_IND(out_h_idx, out_h, in_h);
    int k_h = in_end_h - in_start_h;
    int in_start_w = START_IND(out_w_idx, out_w, in_w);
    int in_end_w = END_IND(out_w_idx, out_w, in_w);
    int k_w = in_end_w - in_start_w;
    const T grad_delta = output[idx] / k_d / k_h / k_w;
    T* input_ptr =
        input + bc_idx * in_panel_size + in_start_d * in_h * in_w + in_start_h * in_w + in_start_w;
    for (int id = 0; id < k_d; ++id) {
      for (int ih = 0; ih < k_h; ++ih) {
        for (int iw = 0; iw < k_w; ++iw) {
          // TODO (Tianyu): Use 'atomic::Add' when necessary
          cuda::atomic::Add(input_ptr + ih * in_w + iw, grad_delta);
        }
      }
      input_ptr += in_h * in_w;  // next input depth
    }
  }
}

template<typename T>
void AvgForwardCompute(KernelComputeContext* ctx, const int32_t& dim) {
  const Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
  Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
  const T* in_ptr = in_tensor->dptr<T>();
  T* out_ptr = out_tensor->mut_dptr<T>();

  const Shape& x_shape = ctx->TensorDesc4ArgNameAndIndex("x", 0)->shape();
  const Shape& y_shape = ctx->TensorDesc4ArgNameAndIndex("y", 0)->shape();

  // TODO (Tianyu): Support 'channels_last'
  std::string data_format = "channels_first";
  const Shape& in = GetShape5D(x_shape, data_format, dim);
  const Shape& out = GetShape5D(y_shape, data_format, dim);

  const int out_elems = out_tensor->shape_view().elem_cnt();

  RUN_CUDA_KERNEL((AdaptiveAvgPoolCudaKernel<T>), ctx->stream(), out_elems, in_ptr, out_ptr,
                  out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4));
}

template<typename T>
void AvgBackwardCompute(KernelComputeContext* ctx, const int32_t& dim) {
  const Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
  Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
  const T* out_ptr = out_tensor->dptr<T>();
  T* in_ptr = in_tensor->mut_dptr<T>();

  const Shape& dx_shape = ctx->TensorDesc4ArgNameAndIndex("dx", 0)->shape();
  const Shape& dy_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape();

  // TODO (Tianyu): Support 'channels_last'
  std::string data_format = "channels_first";
  const Shape& in = GetShape5D(dx_shape, data_format, dim);
  const Shape& out = GetShape5D(dy_shape, data_format, dim);

  const int in_elems = in_tensor->shape_view().elem_cnt();
  const int out_elems = out_tensor->shape_view().elem_cnt();

  RUN_CUDA_KERNEL((InitPtr<T>), ctx->stream(), in_elems, in_elems, in_ptr);
  RUN_CUDA_KERNEL((AdaptiveAvgPoolGradCudaKernel<T>), ctx->stream(), out_elems, in_ptr, out_ptr,
                  out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4));
}

template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool1dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool1dKernel() = default;
  ~GpuAdaptiveAvgPool1dKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 1); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool2dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool2dKernel() = default;
  ~GpuAdaptiveAvgPool2dKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 2); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool3dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool3dKernel() = default;
  ~GpuAdaptiveAvgPool3dKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 3); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool1dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool1dGradKernel() = default;
  ~GpuAdaptiveAvgPool1dGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 1); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool2dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool2dGradKernel() = default;
  ~GpuAdaptiveAvgPool2dGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 2); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool3dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool3dGradKernel() = default;
  ~GpuAdaptiveAvgPool3dGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 3); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(device, dtype)                   \
  REGISTER_USER_KERNEL("adaptive_avg_pool1d")                                  \
      .SetCreateFn<GpuAdaptiveAvgPool1dKernel<device, dtype>>()                \
      .SetIsMatchedHob((HobDeviceType() == device)                             \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value)); \
  REGISTER_USER_KERNEL("adaptive_avg_pool2d")                                  \
      .SetCreateFn<GpuAdaptiveAvgPool2dKernel<device, dtype>>()                \
      .SetIsMatchedHob((HobDeviceType() == device)                             \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value)); \
  REGISTER_USER_KERNEL("adaptive_avg_pool3d")                                  \
      .SetCreateFn<GpuAdaptiveAvgPool3dKernel<device, dtype>>()                \
      .SetIsMatchedHob((HobDeviceType() == device)                             \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value));

REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, float);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, double);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, int);

#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(device, dtype)           \
  REGISTER_USER_KERNEL("adaptive_avg_pool1d_grad")                              \
      .SetCreateFn<GpuAdaptiveAvgPool1dGradKernel<device, dtype>>()             \
      .SetIsMatchedHob((HobDeviceType() == device)                              \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value)); \
  REGISTER_USER_KERNEL("adaptive_avg_pool2d_grad")                              \
      .SetCreateFn<GpuAdaptiveAvgPool2dGradKernel<device, dtype>>()             \
      .SetIsMatchedHob((HobDeviceType() == device)                              \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value)); \
  REGISTER_USER_KERNEL("adaptive_avg_pool3d_grad")                              \
      .SetCreateFn<GpuAdaptiveAvgPool3dGradKernel<device, dtype>>()             \
      .SetIsMatchedHob((HobDeviceType() == device)                              \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value));

REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, float);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, double);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, int);

}  // namespace user_op
}  // namespace oneflow
\ No newline at end of file
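Not part of the commit: a standalone host-side reproduction of the window partition computed by the START_IND/END_IND macros above. For in_size = 10 and out_size = 4 it prints the windows [0, 3), [2, 5), [5, 8), [7, 10); neighbouring windows can overlap, which is why the backward kernel accumulates gradients with cuda::atomic::Add.

// Illustrative sketch only, not part of the commit.
#include <cmath>
#include <cstdio>

int main() {
  const int out_size = 4;
  const int in_size = 10;
  for (int a = 0; a < out_size; ++a) {
    // Same arithmetic as the START_IND / END_IND macros above.
    const int start = static_cast<int>(std::floor(static_cast<float>(a * in_size) / out_size));
    const int end = static_cast<int>(std::ceil(static_cast<float>((a + 1) * in_size) / out_size));
    std::printf("output %d averages input [%d, %d)\n", a, start, end);
  }
  return 0;
}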
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/device/cuda_util.h"
#include "affine_grid_kernel.h"

namespace oneflow {

namespace {

template<typename data_type, bool align_corners>
OF_DEVICE_FUNC data_type LinspaceGPU(int32_t index, int32_t num_steps) {
  if (num_steps <= 1) { return static_cast<data_type>(0.0); }
  if (align_corners) {
    return static_cast<data_type>(-1.0 + 2.0 / (num_steps - 1) * index);
  } else {
    return static_cast<data_type>((-1.0 + 2.0 / (num_steps - 1) * index) * (num_steps - 1)
                                  / num_steps);
  }
}

template<typename data_type, bool align_corners>
__global__ void Generate2DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t H,
                                            int32_t W) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int32_t h = index / W;
    const int32_t w = index % W;
    const int32_t pixel_length = 3;
    data_type* row_ptr = grid_ptr + h * W * pixel_length;
    data_type* pixel_ptr = row_ptr + w * pixel_length;
    data_type h_value = LinspaceGPU<data_type, align_corners>(h, H);
    data_type w_value = LinspaceGPU<data_type, align_corners>(w, W);
    pixel_ptr[0] = w_value;
    pixel_ptr[1] = h_value;
    pixel_ptr[2] = static_cast<data_type>(1.0);
  }
}

template<typename data_type, bool align_corners>
__global__ void Generate3DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr, int32_t D,
                                            int32_t H, int32_t W) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int32_t d = index / H;
    const int32_t h = index % H;
    const int32_t pixel_length = 4;
    data_type* image_ptr = grid_ptr + d * H * W * pixel_length;
    data_type* row_ptr = image_ptr + h * W * pixel_length;
    data_type d_value = LinspaceGPU<data_type, align_corners>(d, D);
    data_type h_value = LinspaceGPU<data_type, align_corners>(h, H);
    for (int32_t w = 0; w < W; ++w) {
      data_type* pixel_ptr = row_ptr + w * pixel_length;
      data_type w_value = LinspaceGPU<data_type, align_corners>(w, W);
      pixel_ptr[0] = w_value;
      pixel_ptr[1] = h_value;
      pixel_ptr[2] = d_value;
      pixel_ptr[3] = static_cast<data_type>(1.0);
    }
  }
}

}  // namespace

void GenerateBaseGridImp<DeviceType::kCUDA>::Generate2D(user_op::KernelComputeContext* ctx,
                                                        float* grid_ptr, int64_t H, int64_t W,
                                                        bool align_corners) {
  int count = H * W;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<float, true>), ctx->stream(), count, count,
                    grid_ptr, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<float, false>), ctx->stream(), count, count,
                    grid_ptr, H, W);
  }
}

void GenerateBaseGridImp<DeviceType::kCUDA>::Generate2D(user_op::KernelComputeContext* ctx,
                                                        double* grid_ptr, int64_t H, int64_t W,
                                                        bool align_corners) {
  int count = H * W;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<double, true>), ctx->stream(), count, count,
                    grid_ptr, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<double, false>), ctx->stream(), count, count,
                    grid_ptr, H, W);
  }
}

void GenerateBaseGridImp<DeviceType::kCUDA>::Generate3D(user_op::KernelComputeContext* ctx,
                                                        float* grid_ptr, int64_t D, int64_t H,
                                                        int64_t W, bool align_corners) {
  int count = D * H;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<float, true>), ctx->stream(), count, count,
                    grid_ptr, D, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<float, false>), ctx->stream(), count, count,
                    grid_ptr, D, H, W);
  }
}

void GenerateBaseGridImp<DeviceType::kCUDA>::Generate3D(user_op::KernelComputeContext* ctx,
                                                        double* grid_ptr, int64_t D, int64_t H,
                                                        int64_t W, bool align_corners) {
  int count = D * H;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<double, true>), ctx->stream(), count, count,
                    grid_ptr, D, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<double, false>), ctx->stream(), count, count,
                    grid_ptr, D, H, W);
  }
}

}  // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_ROCM
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/user/kernels/arange_kernel_util.h"

namespace oneflow {
namespace user_op {
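// ArangeForwardGpuKernel delegates to DoArange (declared in arange_kernel_util.h), which is
// expected to fill out[i] = start + i * delta for the arange_elem_cnt elements.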
template<typename T>
__global__ void ArangeForwardGpuKernel(const T start, const T delta, const int64_t arange_elem_cnt,
                                       T* out) {
  // Use Loop to set the value
  DoArange<T>(start, delta, arange_elem_cnt, out);
}

template<typename T>
struct ArangeFunctor<DeviceType::kCUDA, T> final {
  void operator()(ep::Stream* stream, const T start, const T delta, const int64_t arange_elem_cnt,
                  T* out) {
    // The thread num is set as arange_elem_cnt
    RUN_CUDA_KERNEL((ArangeForwardGpuKernel<T>), stream, arange_elem_cnt, start, delta,
                    arange_elem_cnt, out);
  }
};

OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ARANGE_FUNCTOR, (DeviceType::kCUDA),
                                 ARANGE_DATA_TYPE_SEQ);

} // namespace user_op
} // namespace oneflow
#endif // End WITH_ROCM
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/user/kernels/radix_sort.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"

namespace oneflow {
namespace {
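// TmpBufferManager carves one workspace allocation into three consecutive regions:
// | sorted_in (aligned) | indices (aligned) | radix-sort temp storage (remainder) |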
template<typename T>
class TmpBufferManager final {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager);
  TmpBufferManager(int32_t capacity, void* ptr, const ShapeView& in_shape)
      : capacity_{capacity},
        sorted_in_elem_cnt_{in_shape.elem_cnt()},
        indices_elem_cnt_{sorted_in_elem_cnt_} {
    const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T));
    const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t));
    sorted_in_ptr_ = reinterpret_cast<T*>(ptr);
    indices_ptr_ = reinterpret_cast<int32_t*>(reinterpret_cast<char*>(sorted_in_ptr_)
                                              + sorted_in_aligned_bytes);
    temp_storage_ptr_ =
        reinterpret_cast<void*>(reinterpret_cast<char*>(indices_ptr_) + indices_aligned_bytes);
    temp_storage_bytes_ = capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes;
    CHECK_GE(temp_storage_bytes_, 0);
  }
  ~TmpBufferManager() = default;

  T* SortedInPtr() const { return sorted_in_ptr_; }
  int32_t* IndicesPtr() const { return indices_ptr_; }
  void* TempStoragePtr() const { return temp_storage_ptr_; }
  int32_t TempStorageBytes() const { return temp_storage_bytes_; }

 private:
  int32_t capacity_;
  T* sorted_in_ptr_;
  int32_t* indices_ptr_;
  void* temp_storage_ptr_;
  int64_t sorted_in_elem_cnt_;
  int64_t indices_elem_cnt_;
  int32_t temp_storage_bytes_;
};

__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr, int32_t instance_size) {
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; }
}

} // namespace

template<typename T>
class GpuArgSortKernel final : public user_op::OpKernel {
 public:
  GpuArgSortKernel() = default;
  ~GpuArgSortKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    TmpBufferManager<T> buf_manager(static_cast<int32_t>(tmp_buffer->shape_view().elem_cnt()),
                                    tmp_buffer->mut_dptr<void>(), in->shape_view());
    const int32_t elem_cnt = in->shape_view().elem_cnt();
    const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1);
    const int32_t instance_num = elem_cnt / instance_size;
    const std::string& direction = ctx->Attr<std::string>("direction");
    InitializeIndices<<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                        ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, buf_manager.IndicesPtr(), instance_size);
    if (direction == "ASCENDING") {
      SortPairsAscending(in->dptr<T>(), buf_manager.IndicesPtr(), instance_num, instance_size,
                         buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(),
                         buf_manager.SortedInPtr(), out->mut_dptr<int32_t>(),
                         ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    } else if (direction == "DESCENDING") {
      SortPairsDescending(in->dptr<T>(), buf_manager.IndicesPtr(), instance_num, instance_size,
                          buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(),
                          buf_manager.SortedInPtr(), out->mut_dptr<int32_t>(),
                          ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    } else {
      UNIMPLEMENTED();
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

#define REGISTER_CUDA_ARG_SORT_KERNEL(dtype) \
  REGISTER_USER_KERNEL("arg_sort") \
      .SetCreateFn<GpuArgSortKernel<dtype>>() \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
                       && (user_op::HobDataType("in", 0) == GetDataType<dtype>::value)) \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \
        const Shape& in_shape = ctx->InputShape("in", 0); \
        const int32_t elem_cnt = in_shape.elem_cnt(); \
        const int32_t instance_size = in_shape.dim_vec().back(); \
        const int32_t instance_num = elem_cnt / instance_size; \
        \
        /* Sorted In */ \
        const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype)); \
        /* Indices */ \
        const int32_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int32_t)); \
        /* CUB Temp Storage */ \
        int32_t temp_storage_bytes = -1; \
        const std::string& direction = ctx->Attr<std::string>("direction"); \
        if (direction == "ASCENDING") { \
          temp_storage_bytes = \
              InferTempStorageForSortPairsAscending<dtype, int32_t>(instance_num, instance_size); \
        } else if (direction == "DESCENDING") { \
          temp_storage_bytes = \
              InferTempStorageForSortPairsDescending<dtype, int32_t>(instance_num, instance_size); \
        } else { \
          UNIMPLEMENTED(); \
        } \
        \
        return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes; \
      });

REGISTER_CUDA_ARG_SORT_KERNEL(float)
REGISTER_CUDA_ARG_SORT_KERNEL(double)
REGISTER_CUDA_ARG_SORT_KERNEL(bool)
REGISTER_CUDA_ARG_SORT_KERNEL(int8_t)
REGISTER_CUDA_ARG_SORT_KERNEL(uint8_t)
REGISTER_CUDA_ARG_SORT_KERNEL(int32_t)
REGISTER_CUDA_ARG_SORT_KERNEL(int64_t)

} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/user/kernels/arg_where_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/core/common/small_vector.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hipcub/hipcub.hpp>

namespace oneflow {
namespace {

constexpr int kBlockSize = cuda::elementwise::kBlockSize;

int GetNumBlocks(int64_t elem_cnt) {
  int num_blocks = 0;
  OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks));
  return num_blocks;
}
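// StrideIterator exposes operator[] with a stride of NDIM elements, so DeviceSelect::Flagged can
// write each selected linear offset into the first slot of that element's ND index; the offsets
// are then expanded to full ND indices in place by CudaOffsetToNdIndexInplace.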
template<typename T, int NDIM>
struct StrideIterator {
  typedef StrideIterator self_type;
  typedef std::ptrdiff_t difference_type;
  typedef T value_type;
  typedef T* pointer;
  typedef T& reference;
  typedef std::random_access_iterator_tag iterator_category;

  explicit StrideIterator(T* ptr, size_t max_iters) : ptr_(ptr), max_iters_(max_iters) {}

  OF_DEVICE_FUNC reference operator[](int i) {
    assert(0 <= i && i < max_iters_);
    return *(ptr_ + (i * NDIM));
  }

 private:
  T* ptr_;
  size_t max_iters_;
};

template<typename T, int NDIM>
__global__ void __launch_bounds__(kBlockSize)
    CudaOffsetToNdIndexInplace(NdIndexOffsetHelper<T, NDIM> index_converter,
                               const T* output_size_ptr, T* output_ptr) {
  CUDA_1D_KERNEL_LOOP_T(T, i, *output_size_ptr) {
    T* index_ptr = output_ptr + i * NDIM;
    index_converter.OffsetToNdIndex(*index_ptr, index_ptr);
  }
}

template<typename T>
struct IsTrue {
  __device__ __forceinline__ bool operator()(const T& val) const { return static_cast<bool>(val); }
};

template<typename IN_T, typename OUT_T, typename OUT_ITER>
hipError_t SelectTrue(hipStream_t stream, int num_items, void* temp_storage,
                      size_t& temp_storage_bytes, const IN_T* input, OUT_ITER output_iter,
                      OUT_T* num_selected) {
  IsTrue<IN_T> is_true;
  hipcub::TransformInputIterator<bool, IsTrue<IN_T>, const IN_T*> flag_iter(input, is_true);
  hipcub::CountingInputIterator<OUT_T> offset_counter(0);
  return hipcub::DeviceSelect::Flagged(temp_storage, temp_storage_bytes, offset_counter, flag_iter,
                                       output_iter, num_selected, num_items, stream, false);
}

} // namespace

template<typename IN_T, typename OUT_T, int NDIM>
struct ArgWhereKernelUtil<DeviceType::kCUDA, IN_T, OUT_T, NDIM> {
  static void ArgWhere(ep::Stream* stream, const ShapeView& input_shape, const IN_T* input_ptr,
                       void* temp_storage, size_t temp_storage_bytes, OUT_T* output_ptr,
                       OUT_T* output_size_ptr) {
    const int64_t elem_cnt = input_shape.elem_cnt();
    // deal with empty blob
    if (elem_cnt == 0) {
      Memset<DeviceType::kCUDA>(stream, output_size_ptr, 0, sizeof(OUT_T));
      return;
    }
    CHECK_NOTNULL(stream);
    CHECK_LE(elem_cnt, std::numeric_limits<OUT_T>::max());
    size_t workspace = GetWorkspaceBytesSize(stream, elem_cnt);
    CHECK_LE(workspace, temp_storage_bytes);
    if (NDIM == 1) {
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OUT_T*>(
          stream->As<ep::CudaStream>()->cuda_stream(), input_shape.elem_cnt(), temp_storage,
          workspace, input_ptr, output_ptr, output_size_ptr)));
    } else {
      using OutputIterator = StrideIterator<OUT_T, NDIM>;
      OutputIterator output_iter(output_ptr, elem_cnt);
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OutputIterator>(
          stream->As<ep::CudaStream>()->cuda_stream(), elem_cnt, temp_storage, workspace, input_ptr,
          output_iter, output_size_ptr)));
      OUT_T dims[NDIM] = {0};
      std::transform(input_shape.ptr(), input_shape.ptr() + input_shape.NumAxes(), dims,
                     [](int64_t dim) { return static_cast<OUT_T>(dim); });
      NdIndexOffsetHelper<OUT_T, NDIM> index_converter(dims);
      CudaOffsetToNdIndexInplace<OUT_T, NDIM>
          <<<GetNumBlocks(elem_cnt), kBlockSize, 0, stream->As<ep::CudaStream>()->cuda_stream()>>>(
              index_converter, output_size_ptr, output_ptr);
    }
  }

  static size_t GetWorkspaceBytesSize(ep::Stream* stream, int64_t elem_cnt) {
    hipStream_t cuda_stream = stream ? stream->As<ep::CudaStream>()->cuda_stream() : 0;
    size_t workspace = 0;
    if (NDIM == 1) {
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OUT_T*>(cuda_stream, elem_cnt, nullptr, workspace,
                                                     nullptr, nullptr, nullptr)));
    } else {
      using OutputIterator = StrideIterator<OUT_T, NDIM>;
      OutputIterator output_iter(nullptr, elem_cnt);
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OutputIterator>(
          cuda_stream, elem_cnt, nullptr, workspace, nullptr, output_iter, nullptr)));
    }
    return workspace;
  }
};

INSTANTIATE_ARG_WHERE_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA)

} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include <hipcub/hipcub.hpp>
#include "oneflow/core/ep/rocm/cuda_stream.h"

namespace oneflow {
namespace {
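// TmpBufferManager splits the workspace into two consecutive regions:
// | per-row KeyValuePair results (aligned) | hipcub segmented-reduce temp storage (remainder) |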
template<typename T>
class TmpBufferManager final {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager);
  TmpBufferManager(int32_t capacity, void* ptr, int32_t instance_num)
      : capacity_{capacity}, key_value_out_elem_cnt_{instance_num} {
    const int32_t key_value_out_aligned_bytes =
        GetCudaAlignedSize(key_value_out_elem_cnt_ * sizeof(hipcub::KeyValuePair<int32_t, T>));
    key_value_out_ptr_ = reinterpret_cast<hipcub::KeyValuePair<int32_t, T>*>(ptr);
    temp_storage_ptr_ = reinterpret_cast<void*>(reinterpret_cast<char*>(key_value_out_ptr_)
                                                + key_value_out_aligned_bytes);
    temp_storage_bytes_ = capacity_ - key_value_out_aligned_bytes;
    CHECK_GE(temp_storage_bytes_, 0);
  }
  ~TmpBufferManager() = default;

  hipcub::KeyValuePair<int32_t, T>* KeyValueOutPtr() const { return key_value_out_ptr_; }
  void* TempStoragePtr() const { return temp_storage_ptr_; }
  int32_t TempStorageBytes() const { return temp_storage_bytes_; }

 private:
  int32_t capacity_;
  hipcub::KeyValuePair<int32_t, T>* key_value_out_ptr_;
  void* temp_storage_ptr_;
  int32_t key_value_out_elem_cnt_;
  int32_t temp_storage_bytes_;
};

class MultiplyFunctor final {
 public:
  MultiplyFunctor(int32_t num_col) : num_col_(num_col) {}
  __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const {
    return idx * num_col_;
  }

 private:
  int32_t num_col_;
};

template<typename T>
size_t InferTempStorageForArgMax(int32_t num_row, int32_t num_col) {
  using SegmentOffsetIter =
      hipcub::TransformInputIterator<int32_t, MultiplyFunctor, hipcub::CountingInputIterator<int32_t>>;
  hipcub::CountingInputIterator<int32_t> counting_iter(0);
  MultiplyFunctor multiply_functor(num_col);
  SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor);
  size_t temp_storage_bytes = 0;
  auto err =
      hipcub::DeviceSegmentedReduce::ArgMax<T*, hipcub::KeyValuePair<int32_t, T>*, SegmentOffsetIter>(
          /* d_temp_storage */ nullptr, /* temp_storage_bytes */ temp_storage_bytes,
          /* d_in */ nullptr, /* d_out */ nullptr, /* num_segments */ num_row,
          /* d_begin_offsets */ segment_offset_iter, /* d_end_offsets */ segment_offset_iter + 1,
          /* stream */ 0);
  // auto err =
  //     hipcub::DeviceReduce::ArgMax<T*, hipcub::KeyValuePair<int32_t, T>*>(
  //         nullptr, temp_storage_bytes,
  //         nullptr, nullptr, num_row,
  //         0);
  OF_CUDA_CHECK(err);
  return temp_storage_bytes;
}

template<typename T>
void ArgMax(const T* in_ptr, int32_t num_row, int32_t num_col, void* temp_storage_ptr,
            int32_t temp_storage_bytes, hipcub::KeyValuePair<int32_t, T>* out_ptr,
            hipStream_t stream) {
  size_t rt_inferred_temp_storage_bytes = InferTempStorageForArgMax<T>(num_row, num_col);
  CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes);
  using SegmentOffsetIter =
      hipcub::TransformInputIterator<int32_t, MultiplyFunctor, hipcub::CountingInputIterator<int32_t>>;
  hipcub::CountingInputIterator<int32_t> counting_iter(0);
  MultiplyFunctor multiply_functor(num_col);
  SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor);
  // void * d_temp_storage = nullptr;
  // hipMalloc((void **)&d_temp_storage, rt_inferred_temp_storage_bytes);
  auto err = hipcub::DeviceSegmentedReduce::ArgMax(
      /* d_temp_storage */ temp_storage_ptr,
      /* temp_storage_bytes */ rt_inferred_temp_storage_bytes,
      /* d_in */ in_ptr,
      /* d_out */ out_ptr,
      /* num_segments */ num_row,
      /* d_begin_offsets */ segment_offset_iter,
      /* d_end_offsets */ segment_offset_iter + 1,
      /* stream */ stream);
  // auto err =
  //     hipcub::DeviceReduce::ArgMax(
  //         d_temp_storage, rt_inferred_temp_storage_bytes,
  //         in_ptr, out_ptr, num_row,
  //         stream);
  OF_CUDA_CHECK(err);
}

template<typename T>
__global__ void WriteKeysToOutput(const int32_t instance_num, const int32_t instance_size,
                                  const hipcub::KeyValuePair<int32_t, T>* key_value_out_ptr,
                                  int64_t* out_ptr) {
  CUDA_1D_KERNEL_LOOP(i, instance_num) { out_ptr[i] = key_value_out_ptr[i].key % instance_size; }
}

} // namespace

template<typename T>
class GpuArgMaxKernel final : public user_op::OpKernel {
 public:
  GpuArgMaxKernel() = default;
  ~GpuArgMaxKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int32_t elem_cnt = in->shape_view().elem_cnt();
    const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1);
    const int32_t instance_num = elem_cnt / instance_size;
    TmpBufferManager<T> buffer_manager(tmp_buffer->shape_view().elem_cnt(),
                                       tmp_buffer->mut_dptr<void>(), instance_num);
    ArgMax(in->dptr<T>(), instance_num, instance_size, buffer_manager.TempStoragePtr(),
           buffer_manager.TempStorageBytes(), buffer_manager.KeyValueOutPtr(),
           ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    WriteKeysToOutput<T><<<BlocksNum4ThreadsNum(instance_num), kCudaThreadsNumPerBlock, 0,
                           ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        instance_num, instance_size, buffer_manager.KeyValueOutPtr(), out->mut_dptr<int64_t>());
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

#define REGISTER_CUDA_ARGMAX_KERNEL(dtype) \
  REGISTER_USER_KERNEL("argmax") \
      .SetCreateFn<GpuArgMaxKernel<dtype>>() \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
                       && (user_op::HobDataType("in", 0) == GetDataType<dtype>::value)) \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) { \
        const Shape& in_shape = ctx->InputShape("in", 0); \
        const int32_t instance_size = in_shape.dim_vec().back(); \
        const int32_t instance_num = in_shape.elem_cnt() / instance_size; \
        \
        /* Key-Value Out */ \
        int32_t key_value_out_bytes = \
            GetCudaAlignedSize(instance_num * sizeof(hipcub::KeyValuePair<int32_t, dtype>)); \
        \
        /* CUB Temp Storage */ \
        size_t temp_storage_bytes = InferTempStorageForArgMax<dtype>(instance_num, instance_size); \
        \
        return key_value_out_bytes + temp_storage_bytes; \
      });

REGISTER_CUDA_ARGMAX_KERNEL(float)
REGISTER_CUDA_ARGMAX_KERNEL(double)
REGISTER_CUDA_ARGMAX_KERNEL(uint8_t)
REGISTER_CUDA_ARGMAX_KERNEL(int8_t)
REGISTER_CUDA_ARGMAX_KERNEL(int32_t)
REGISTER_CUDA_ARGMAX_KERNEL(int64_t)

} // namespace oneflow
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
/* /*
Copyright 2020 The OneFlow Authors. All rights reserved. Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
#include <cstdint> #include <cstdint>
#include "oneflow/core/hip/atomic.hip.h" #include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/common/just.h" #include "oneflow/core/common/just.h"
#include "oneflow/core/common/util.h" #include "oneflow/core/common/util.h"
#include "oneflow/core/framework/consistency_check.h" #include "oneflow/core/framework/consistency_check.h"
#include "oneflow/core/framework/framework.h" #include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h" #include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.h" #include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h" #include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/common/nd_index_offset_helper.h" #include "oneflow/core/common/nd_index_offset_helper.h"
namespace oneflow { namespace oneflow {
namespace { namespace {
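// as_strided maps every output element back to an input offset:
//   index_in_input = storage_offset + sum_j(dst_index[j] * stride[j])
// The forward kernel gathers from that offset; the backward kernel scatters gradients to it with
// an atomic add, since several output elements may alias the same input element.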
constexpr size_t NUM_DIM = 8;

template<size_t num_dims, typename IndexType>
struct AsStridedParams {
  NdIndexOffsetHelper<IndexType, num_dims> destIndexOffsetHelper;
  int64_t dest_dims[num_dims];
  int32_t stride[num_dims];
  int32_t dest_num_dims;
  int32_t storage_offset;
  int32_t input_num;
  int32_t output_num;
};

template<typename T>
__global__ void AsStrided_kernel(const T* input_buf, T* output_buf,
                                 AsStridedParams<NUM_DIM, int64_t> params) {
  const int64_t* dest_dims = reinterpret_cast<const int64_t*>(params.dest_dims);
  const int32_t* stride = reinterpret_cast<const int32_t*>(params.stride);
  CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) {
    int64_t dst_index[NUM_DIM];
    params.destIndexOffsetHelper.OffsetToNdIndex(i, dst_index, params.dest_num_dims);
    int32_t index_in_input = params.storage_offset;
    FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_input += dst_index[j] * stride[j]; }
    output_buf[i] = input_buf[index_in_input];
  }
}

template<typename T>
__global__ void AsStridedGrad_kernel(const T* dy_buf, T* dx_buf,
                                     AsStridedParams<NUM_DIM, int64_t> params) {
  const int64_t* dest_dims = reinterpret_cast<const int64_t*>(params.dest_dims);
  const int32_t* stride = reinterpret_cast<const int32_t*>(params.stride);
  CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) {
    int64_t dy_index[NUM_DIM];
    params.destIndexOffsetHelper.OffsetToNdIndex(i, dy_index, params.dest_num_dims);
    int32_t index_in_dx = params.storage_offset;
    FOR_RANGE(int64_t, j, 0, params.dest_num_dims) { index_in_dx += dy_index[j] * stride[j]; }
    cuda::atomic::Add(dx_buf + index_in_dx, dy_buf[i]);
  }
}

template<typename T>
struct AsStridedFunctor final {
  void operator()(ep::Stream* stream, const T* input_buf, T* output_buf, const int64_t* dest_dims,
                  const int32_t* stride, const int32_t dest_num_dims, const int32_t storage_offset,
                  const int32_t input_num, const int32_t output_num) {
    NdIndexOffsetHelper<int64_t, NUM_DIM> destIndexOffsetHelper(dest_dims, dest_num_dims);
    AsStridedParams<NUM_DIM, int64_t> params;
    params.destIndexOffsetHelper = destIndexOffsetHelper;
    FOR_RANGE(size_t, i, 0, dest_num_dims) {
      params.dest_dims[i] = dest_dims[i];
      params.stride[i] = stride[i];
    }
    params.dest_num_dims = dest_num_dims;
    params.storage_offset = storage_offset;
    params.input_num = input_num;
    params.output_num = output_num;
    AsStrided_kernel<T>
        <<<BlocksNum4ThreadsNum(output_num), kCudaThreadsNumPerBlock, 0,
           stream->As<ep::CudaStream>()->cuda_stream()>>>(input_buf, output_buf, params);
  }
};

template<typename T>
struct AsStridedGradFunctor final {
  void operator()(ep::Stream* stream, const T* dy_buf, T* dx_buf, const int64_t* dy_dims,
                  const int32_t* stride, const int32_t dy_num_dims, const int32_t storage_offset,
                  const int32_t dx_num, const int32_t dy_num) {
    NdIndexOffsetHelper<int64_t, NUM_DIM> dyIndexOffsetHelper(dy_dims, dy_num_dims);
    AsStridedParams<NUM_DIM, int64_t> params;
    params.destIndexOffsetHelper = dyIndexOffsetHelper;
    FOR_RANGE(size_t, i, 0, dy_num_dims) {
      params.dest_dims[i] = dy_dims[i];
      params.stride[i] = stride[i];
    }
    params.dest_num_dims = dy_num_dims;
    params.storage_offset = storage_offset;
    params.input_num = dx_num;
    params.output_num = dy_num;
    AsStridedGrad_kernel<T>
        <<<BlocksNum4ThreadsNum(dy_num), kCudaThreadsNumPerBlock, 0,
           stream->As<ep::CudaStream>()->cuda_stream()>>>(dy_buf, dx_buf, params);
  }
};

} // namespace

template<typename T>
class GpuAsStridedKernel final : public user_op::OpKernel {
 public:
  GpuAsStridedKernel() = default;
  ~GpuAsStridedKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0);
    user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0);
    const auto size = ctx->Attr<std::vector<int32_t>>("size");
    const auto stride = ctx->Attr<std::vector<int32_t>>("stride");
    const int32_t storage_offset = ctx->Attr<int32_t>("storage_offset");
    size_t dest_num_dims = output->shape_view().NumAxes();
    const int64_t* dest_dims = output->shape_view().ptr();
    const size_t input_num = input->shape_view().Count(0);
    const size_t output_num = output->shape_view().Count(0);
    if (input_num == 0) {
      // 0-size tensor
      return;
    }
    AsStridedFunctor<T>()(ctx->stream(), input->dptr<T>(), output->mut_dptr<T>(), dest_dims,
                          stride.data(), dest_num_dims, storage_offset, input_num, output_num);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

template<typename T>
class GpuAsStridedGradKernel final : public user_op::OpKernel {
 public:
  GpuAsStridedGradKernel() = default;
  ~GpuAsStridedGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
    user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
    const auto size = ctx->Attr<std::vector<int32_t>>("size");
    const auto stride = ctx->Attr<std::vector<int32_t>>("stride");
    const int32_t storage_offset = ctx->Attr<int32_t>("storage_offset");
    size_t dy_num_dims = dy->shape_view().NumAxes();
    const int64_t* dy_dims = dy->shape_view().ptr();
    const size_t dx_num = dx->shape_view().Count(0);
    const size_t dy_num = dy->shape_view().Count(0);
    Memset<DeviceType::kCUDA>(ctx->stream(), dx->mut_dptr(), 0,
                              dx->shape_view().Count(0) * sizeof(T));
    AsStridedGradFunctor<T>()(ctx->stream(), dy->dptr<T>(), dx->mut_dptr<T>(), dy_dims,
                              stride.data(), dy_num_dims, storage_offset, dx_num, dy_num);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

#define REGISTER_GPUASSTRIDED_KERNEL(in_type) \
  REGISTER_USER_KERNEL("as_strided") \
      .SetCreateFn<GpuAsStridedKernel<in_type>>() \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
                       && (user_op::HobDataType("input", 0) == GetDataType<in_type>::value)); \
  REGISTER_USER_KERNEL("as_strided_grad") \
      .SetCreateFn<GpuAsStridedGradKernel<in_type>>() \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
                       && (user_op::HobDataType("input", 0) == GetDataType<in_type>::value));

REGISTER_GPUASSTRIDED_KERNEL(half);
REGISTER_GPUASSTRIDED_KERNEL(float);
REGISTER_GPUASSTRIDED_KERNEL(double);
REGISTER_GPUASSTRIDED_KERNEL(int64_t);

#undef REGISTER_GPUASSTRIDED_KERNEL

} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"

namespace oneflow {
namespace {
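// AssignGpu copies value into ref only when the single-element condition tensor matches assign_if:
// with assign_if == true the copy runs when *condition != 0, with assign_if == false it runs when
// *condition == 0; otherwise every thread returns immediately.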
template<bool assign_if, typename C, typename T> template<bool assign_if, typename C, typename T>
__global__ void AssignGpu(int64_t elem_cnt, const C* condition, const T* value, T* ref) { __global__ void AssignGpu(int64_t elem_cnt, const C* condition, const T* value, T* ref) {
if (assign_if == (*condition == 0)) { return; } if (assign_if == (*condition == 0)) { return; }
CUDA_1D_KERNEL_LOOP(i, elem_cnt) { ref[i] = value[i]; } CUDA_1D_KERNEL_LOOP(i, elem_cnt) { ref[i] = value[i]; }
} }
template<bool assign_if, typename C, typename T>
class AssignIfGPUKernel final : public user_op::OpKernel {
 public:
  AssignIfGPUKernel() = default;
  ~AssignIfGPUKernel() override = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* condition = ctx->Tensor4ArgNameAndIndex("condition", 0);
    CHECK_EQ(condition->shape_view().NumAxes(), 1);
    CHECK_EQ(condition->shape_view().At(0), 1);
    const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0);
    user_op::Tensor* ref = ctx->Tensor4ArgNameAndIndex("ref", 0);
    if (value->dptr() == ref->dptr()) { return; }
    CHECK_EQ(value->shape_view(), ref->shape_view());
    CHECK_EQ(value->data_type(), ref->data_type());
    const size_t elem_cnt = ref->shape_view().elem_cnt();
    AssignGpu<assign_if, C, T><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                 ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, condition->dptr<C>(), value->dptr<T>(), ref->mut_dptr<T>());
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; }
};
} // namespace
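// The macros below register two user-op kernels for every (condition_type, value_type) pair
// in INT_DATA_TYPE_SEQ x POD_DATA_TYPE_SEQ: "assign_if" copies when the condition is
// non-zero, and "assign_if_not" copies when the condition is zero.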
#define REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(op_type_name, assign_if, condition_type, \
                                                         value_type)                              \
  REGISTER_USER_KERNEL(op_type_name)                                                              \
      .SetCreateFn<AssignIfGPUKernel<assign_if, condition_type, value_type>>()                    \
      .SetIsMatchedHob(                                                                           \
          (user_op::HobDeviceType() == DeviceType::kCUDA)                                         \
          && (user_op::HobDataType("condition", 0) == GetDataType<condition_type>::value)         \
          && (user_op::HobDataType("value", 0) == GetDataType<value_type>::value));
#define REGISTER_ASSIGN_IF_CUDA_KERNEL(condition_type, value_type)                        \
  REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(                                       \
      "assign_if", true, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type)); \
  REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(                                       \
      "assign_if_not", false, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type))
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_ASSIGN_IF_CUDA_KERNEL, INT_DATA_TYPE_SEQ,
                                 POD_DATA_TYPE_SEQ)
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include <cstdint>
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/user/kernels/avg_pool_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace {
constexpr int kBlockSize = cuda::elementwise::kBlockSize;
int GetMinThreadNum(const int64_t elem_num) { return std::min<int64_t>(elem_num, kBlockSize); }
int GetNumBlocks(int32_t elem_cnt) {
  int num_blocks = 0;
  OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks));
  return num_blocks;
}
} // namespace
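// Each DoCUDAAvgPool{1,2,3}d{Forward,Backward} kernel below is a thin __global__ wrapper
// around the shared Avgpool*Compute device routines declared in avg_pool_kernel_util.h.
// Launch configuration: GetNumBlocks() picks the block count via the elementwise helper,
// and GetMinThreadNum() caps the threads per block at kBlockSize for small workloads.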
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool1dForward(const NdIndexOffsetHelper<IDX, 2> index_helper, IDX elem_num,
                                const T* src, T* dest, int32_t padding_l, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_length,
                                const int32_t kernel_size_l, const int32_t stride_l,
                                const bool count_include_pad, const int32_t divisor_override) {
  Avgpool1dForwardCompute<T>(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel,
                             x_length, kernel_size_l, stride_l, count_include_pad,
                             divisor_override);
};
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool2dForward(const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num,
                                const T* src, T* dest, const int32_t padding_h,
                                const int32_t padding_w, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_height,
                                const int32_t x_width, const int32_t kernel_size_h,
                                const int32_t kernel_size_w, const int32_t stride_h,
                                const int32_t stride_w, const bool count_include_pad,
                                const int32_t divisor_override) {
  Avgpool2dForwardCompute<T>(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch,
                             n_channel, x_height, x_width, kernel_size_h, kernel_size_w, stride_h,
                             stride_w, count_include_pad, divisor_override);
};
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool3dForward(const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num,
                                const T* src, T* dest, int32_t padding_t, const int32_t padding_h,
                                const int32_t padding_w, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_time,
                                const int32_t x_height, const int32_t x_width,
                                const int32_t kernel_size_t, int32_t kernel_size_h,
                                const int32_t kernel_size_w, const int32_t stride_t,
                                const int32_t stride_h, const int32_t stride_w,
                                const bool count_include_pad, const int32_t divisor_override) {
  Avgpool3dForwardCompute<T>(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w,
                             n_batch, n_channel, x_time, x_height, x_width, kernel_size_t,
                             kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w,
                             count_include_pad, divisor_override);
};
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool1dBackward(const NdIndexOffsetHelper<IDX, 2> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_l,
                                 const int32_t n_batch, const int32_t n_channel,
                                 const int32_t input_length, const int32_t kernel_size_l,
                                 const int32_t stride_l, const bool count_include_pad,
                                 const int32_t divisor_override) {
  Avgpool1dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel,
                              input_length, kernel_size_l, stride_l, count_include_pad,
                              divisor_override);
};
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool2dBackward(const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_h,
                                 const int32_t padding_w, const int32_t n_batch,
                                 const int32_t n_channel, const int32_t input_height,
                                 const int32_t input_width, const int32_t kernel_size_h,
                                 const int32_t kernel_size_w, const int32_t stride_h,
                                 const int32_t stride_w, const bool count_include_pad,
                                 int32_t divisor_override) {
  Avgpool2dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch,
                              n_channel, input_height, input_width, kernel_size_h, kernel_size_w,
                              stride_h, stride_w, count_include_pad, divisor_override);
};
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__ void DoCUDAAvgPool3dBackward(
    const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num, const T* src, T* dest,
    const int32_t padding_t, const int32_t padding_h, const int32_t padding_w,
    const int32_t n_batch, const int32_t n_channel, const int32_t x_time, const int32_t x_height,
    const int32_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h,
    const int32_t kernel_size_w, const int32_t stride_t, const int32_t stride_h,
    const int32_t stride_w, const bool count_include_pad, const int32_t divisor_override) {
  Avgpool3dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w,
                              n_batch, n_channel, x_time, x_height, x_width, kernel_size_t,
                              kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w,
                              count_include_pad, divisor_override);
};
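// AvgPoolKernelUtil dispatches the wrappers above. All pooling parameters are carried in an
// AvgPoolParams3D expressed in 3-D (depth/time, height, width) order, so the 1-D case reads
// index [2] (width), the 2-D case reads [1] and [2], and the 3-D case reads [0], [1], and [2];
// the spatial extents come from the trailing axes of GetXShape5D().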
template<typename T, typename IDX>
struct AvgPoolKernelUtil<DeviceType::kCUDA, T, IDX> {
  static void Avgpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool1dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(),
        params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }
  static void Avgpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool1dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(),
        params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }
  static void Avgpool2dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 3>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool2dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2],
        params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(),
        params_3d.divisor_override());
  }
  static void Avgpool2dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool2dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2],
        params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(),
        params_3d.divisor_override());
  }
  static void Avgpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool3dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1],
        params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4),
        params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2],
        params_3d.count_include_pad(), params_3d.divisor_override());
  }
  static void Avgpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool3dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1],
        params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4),
        params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2],
        params_3d.count_include_pad(), params_3d.divisor_override());
  }
};
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_AVG_POOL_KERNEL_UTIL, (DeviceType::kCUDA),
                                 AVG_POOL_DATA_TYPE_CUDA_SEQ, AVG_POOL_IDX_DATA_TYPE_SEQ);
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/user/kernels/batch_gather_kernel_util.h"
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <assert.h>
namespace oneflow {
namespace {
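// GetInOffset maps a flat offset into the output tensor, laid out as
// (batch, indices_num, instance_size), to the matching flat offset in the input tensor,
// laid out as (batch, gather_dim_size, instance_size). For example, with indices_num = 2,
// instance_size = 3 and gather_dim_size = 5, out_offset 7 decomposes into batch 1,
// indices index 0, inner index 1, and reads input element 1 * 5 * 3 + indices[2] * 3 + 1.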
template<typename K>
__device__ int64_t GetInOffset(const int64_t out_offset, const K* indices,
                               const int64_t indices_num, const int64_t instance_size,
                               const int64_t gather_dim_size) {
  const int64_t batch_idx = out_offset / (indices_num * instance_size);
  const int64_t indices_idx = out_offset % (indices_num * instance_size) / instance_size;
  const int64_t inner_idx = out_offset % instance_size;
  const int64_t idx = indices[batch_idx * indices_num + indices_idx];
  assert(idx >= 0 && idx < gather_dim_size);
  return batch_idx * gather_dim_size * instance_size + idx * instance_size + inner_idx;
}
template<typename T, typename K>
__global__ void BatchGatherForwardGpu(const int64_t elem_cnt, const T* in, const K* indices,
                                      const int64_t indices_num, const int64_t instance_size,
                                      const int64_t gather_dim_size, T* out) {
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) {
    out[i] = in[GetInOffset<K>(i, indices, indices_num, instance_size, gather_dim_size)];
  }
}
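// The backward pass scatters gradients back through the same index mapping. Because several
// output positions can point at the same input element when indices repeat, accumulation
// uses cuda::atomic::Add instead of a plain store.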
template<typename T, typename K>
__global__ void BatchGatherBackwardGpu(const int64_t elem_cnt, const T* out_diff, const K* indices,
                                       const int64_t indices_num, const int64_t instance_size,
                                       const int64_t gather_dim_size, T* in_diff) {
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) {
    cuda::atomic::Add(
        in_diff + GetInOffset<K>(i, indices, indices_num, instance_size, gather_dim_size),
        out_diff[i]);
  }
}
} // namespace
template<typename T, typename K>
struct BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K> final {
  static void Forward(ep::Stream* stream, const T* in, const K* indices,
                      const Shape& flat_out_shape, const int64_t gather_dim_size, T* out);
  static void Backward(ep::Stream* stream, const T* out_diff, const K* indices,
                       const Shape& flat_out_diff_shape, const int64_t gather_dim_size, T* in_diff);
};
template<typename T, typename K>
void BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K>::Forward(ep::Stream* stream, const T* in,
                                                                 const K* indices,
                                                                 const Shape& flat_out_shape,
                                                                 const int64_t gather_dim_size,
                                                                 T* out) {
  const int64_t batch_num = flat_out_shape.At(0);
  const int64_t indices_num = flat_out_shape.At(1);
  const int64_t instance_size = flat_out_shape.At(2);
  const int64_t elem_cnt = batch_num * indices_num * instance_size;
  BatchGatherForwardGpu<T, K><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                stream->As<ep::CudaStream>()->cuda_stream()>>>(
      elem_cnt, in, indices, indices_num, instance_size, gather_dim_size, out);
}
template<typename T, typename K>
void BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K>::Backward(
    ep::Stream* stream, const T* out_diff, const K* indices, const Shape& flat_out_diff_shape,
    const int64_t gather_dim_size, T* in_diff) {
  const int64_t batch_num = flat_out_diff_shape.At(0);
  const int64_t indices_num = flat_out_diff_shape.At(1);
  const int64_t instance_size = flat_out_diff_shape.At(2);
  const int64_t elem_cnt = batch_num * indices_num * instance_size;
  BatchGatherBackwardGpu<T, K><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                 stream->As<ep::CudaStream>()->cuda_stream()>>>(
      elem_cnt, out_diff, indices, indices_num, instance_size, gather_dim_size, in_diff);
}
#define INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA(in_type_pair, index_type_pair)          \
  template struct BatchGatherKernelUtilImpl<DeviceType::kCUDA, OF_PP_PAIR_FIRST(in_type_pair), \
                                            OF_PP_PAIR_FIRST(index_type_pair)>;
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA,
                                 FLOATING_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ);
#undef INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/user/kernels/loss_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace user_op {
namespace {
using namespace loss;
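// Element-wise binary cross entropy:
//   loss = -(target * log(input) + (1 - target) * log(1 - input))
// Each log term is clamped from below at -100 so that log(0) on a saturated input yields a
// large finite loss instead of -inf; inputs are expected to be probabilities in [0, 1].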
template<typename T>
struct BinaryCrossEntropyFunctor {
  T zero_;
  T one_;
  T negative_hundred_;
  BinaryCrossEntropyFunctor()
      : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()), negative_hundred_(static_cast<T>(-100)) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val) const {
    assert(input_val >= zero_);
    assert(input_val <= one_);
    return (target_val - one_) * max(static_cast<T>(log(one_ - input_val)), negative_hundred_)
           - target_val * max(static_cast<T>(log(input_val)), negative_hundred_);
  }
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
template<>
struct BinaryCrossEntropyFunctor<float> {
  float zero_;
  float one_;
  float negative_hundred_;
  BinaryCrossEntropyFunctor() : zero_(0.f), one_(1.f), negative_hundred_(-100.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val) const {
    assert(input_val >= zero_);
    assert(input_val <= one_);
    return (target_val - one_) * max(logf(one_ - input_val), negative_hundred_)
           - target_val * max(logf(input_val), negative_hundred_);
  }
  __device__ __forceinline__ float operator()(float input_val, float target_val,
                                              float weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
template<>
struct BinaryCrossEntropyFunctor<half> {
  BinaryCrossEntropyFunctor<float> float_functor;
  __device__ __forceinline__ half operator()(half input_val, half target_val) const {
    return __float2half(float_functor(__half2float(input_val), __half2float(target_val)));
  }
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
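// Gradient of the loss above with respect to `input`:
//   dL/dinput = dy * (input - target) / max((1 - input) * input, eps)
// where eps = 1e-12 keeps the denominator away from zero when the input saturates at 0 or 1.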
template<typename T>
struct BinaryCrossEntropyGradFunctor {
  T eps_;
  T one_;
  BinaryCrossEntropyGradFunctor() : eps_(static_cast<T>(1e-12)), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const {
    return dy_val * (input_val - target_val) / max((one_ - input_val) * input_val, eps_);
  }
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val) const {
    return (*this)(input_val, target_val, dy_val) * weight_val;
  }
};
template<>
struct BinaryCrossEntropyGradFunctor<half> {
  BinaryCrossEntropyGradFunctor<float> float_functor;
  BinaryCrossEntropyGradFunctor() {}
  __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const {
    return __float2half(
        float_functor(__half2float(input_val), __half2float(target_val), __half2float(dy_val)));
  }
  __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val,
                                             half weight_val) const {
    return __float2half(float_functor(__half2float(input_val), __half2float(target_val),
                                      __half2float(dy_val), __half2float(weight_val)));
  }
};
template<typename T>
class BinaryCrossEntropyKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyKernel() = default;
  ~BinaryCrossEntropyKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    if (ctx->has_input("weight", 0)) {
      const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
      OF_CUDA_CHECK(
          (cuda::elementwise::Ternary(BinaryCrossEntropyFunctor<T>(), elem_cnt, out, input, target,
                                      weight, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    } else {
      OF_CUDA_CHECK(
          (cuda::elementwise::Binary(BinaryCrossEntropyFunctor<T>(), elem_cnt, out, input, target,
                                     ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class BinaryCrossEntropyGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyGradKernel() = default;
  ~BinaryCrossEntropyGradKernel() = default;
 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    if (ctx->has_input("weight", 0)) {
      const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
      using FunctorT = BinaryCrossEntropyGradFunctor<T>;
      using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
      OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
          FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    } else {
      OF_CUDA_CHECK((cuda::elementwise::Ternary(
          BinaryCrossEntropyGradFunctor<T>(), elem_cnt, dx, input, target, dy,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
} // namespace
#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype)                                         \
  REGISTER_USER_KERNEL("binary_cross_entropy")                                              \
      .SetCreateFn<BinaryCrossEntropyKernel<dtype>>()                                       \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                      \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)   \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));
#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype)                                    \
  REGISTER_USER_KERNEL("binary_cross_entropy_grad")                                         \
      .SetCreateFn<BinaryCrossEntropyGradKernel<dtype>>()                                   \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                      \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)   \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)      \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double)
} // namespace user_op
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ndarray/ndarray_util.h"
#include "oneflow/core/ndarray/xpu_var_ndarray.h"
#include "oneflow/user/kernels/loss_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow {
namespace user_op {
namespace {
using namespace loss;
enum class WeightType {
  kNone,
  kWeight,
  kPosWeight,
  kBoth,
};
template<typename T, WeightType WEIGHT_TYPE>
struct BinaryCrossEntropyWithLogitsFunctor;
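// BCE-with-logits is evaluated directly on the raw logit x in a numerically stable form:
//   loss = (1 - target) * x + max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
// which is algebraically equal to (1 - target) * x + log(1 + exp(-x)) but avoids overflow of
// exp(-x) for large negative x. The kPosWeight variant scales the log term by
// (pos_weight - target + 1) so positive samples can be re-weighted.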
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone> {
  T zero_;
  T one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val) const {
    const T max_val = -input_val < zero_ ? zero_ : -input_val;
    return (one_ - target_val) * input_val + max_val
           + (log(exp(-max_val) + exp(-input_val - max_val)));
  }
};
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight> {
  T zero_;
  T one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    const T max_val = -input_val < zero_ ? zero_ : -input_val;
    const T pos_weight_processed_val = weight_val - target_val + one_;
    return (one_ - target_val) * input_val
           + (pos_weight_processed_val
              * (log(exp(-max_val) + exp(-input_val - max_val)) + max_val));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kNone> {
  float zero_;
  float one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val) const {
    const float max_val = -input_val < zero_ ? zero_ : -input_val;
    return (one_ - target_val) * input_val + max_val
           + (logf(expf(-max_val) + expf(-input_val - max_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kPosWeight> {
  float zero_;
  float one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val,
                                              float weight_val) const {
    const float max_val = -input_val < zero_ ? zero_ : -input_val;
    const float pos_weight_processed_val = weight_val - target_val + one_;
    return (one_ - target_val) * input_val
           + (pos_weight_processed_val
              * (logf(expf(-max_val) + expf(-input_val - max_val)) + max_val));
  }
};
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    return f(input_val, target_val) * weight_val;
  }
};
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val,
                                          T pos_weight_val) const {
    return f(input_val, target_val, pos_weight_val) * weight_val;
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kNone> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kNone> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val) const {
    return __float2half(f(__half2float(input_val), __half2float(target_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kPosWeight> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kPosWeight> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return __float2half(
        f(__half2float(input_val), __half2float(target_val), __half2float(weight_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kWeight> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return __float2half(
        f(__half2float(input_val), __half2float(target_val), __half2float(weight_val)));
  }
};
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kBoth> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val, half weight_val,
                                             half pos_weight_val) const {
    return __float2half(f(__half2float(input_val), __half2float(target_val),
                          __half2float(weight_val), __half2float(pos_weight_val)));
  }
};
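// CalSigmoid evaluates the logistic function through the identity
//   sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5,
// with a float specialization using tanhf and a half specialization that round-trips
// through float.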
template<typename T>
__device__ __forceinline__ T CalSigmoid(const T x) {
  const T half_of_one = static_cast<T>(0.5);
  return half_of_one * tanh(half_of_one * x) + half_of_one;
}

template<>
__device__ __forceinline__ float CalSigmoid(const float x) {
  const float half_of_one = static_cast<float>(0.5);
  return half_of_one * tanhf(half_of_one * x) + half_of_one;
}

template<>
__device__ __forceinline__ half CalSigmoid(const half x) {
  return __float2half(CalSigmoid(__half2float(x)));
}
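// Gradient functors: d/dx BCEWithLogits(x, t) = sigmoid(x) - t, scaled by the upstream
// gradient dy. The kPosWeight variant expects its weight argument to already be the
// pos_weight multiplied by the target (pos_weight_processed in the kernels below); the
// kWeight and kBoth variants additionally scale by the elementwise weight.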
template<typename T, WeightType WEIGHT_TYPE>
struct BinaryCrossEntropyWithLogitsGradFunctor;

template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone> {
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const {
    return (CalSigmoid(input_val) - target_val) * dy_val;
  }
};

template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight> {
  T one_;
  BinaryCrossEntropyWithLogitsGradFunctor() : one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val,
                                          T weight_val) const {
    return dy_val * ((weight_val + one_ - target_val) * CalSigmoid(input_val) - weight_val);
  }
};

template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val,
                                          T weight_val) const {
    return f(input_val, target_val, dy_val) * weight_val;
  }
};

template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val,
                                          T pos_weight_val) const {
    return f(input_val, target_val, dy_val, pos_weight_val) * weight_val;
  }
};
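// Forward kernel. When has_pos_weight is set, pos_weight (shaped like the last axis of
// target) is first broadcast-multiplied with target into the tmp buffer; the elementwise
// launch then picks the kBoth / kPosWeight / kWeight / kNone functor depending on which
// optional inputs are present.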
template<typename T>
class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsKernel() = default;
  ~BinaryCrossEntropyWithLogitsKernel() override = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    if (ctx->Attr<bool>("has_pos_weight")) {
      T* pos_weight_processed = tmp_buffer_blob->mut_dptr<T>();
      const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr<T>();
      Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes());
      pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1,
                           ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt());
      NdarrayUtil<DeviceType::kCUDA, T>::BroadcastMul(
          ctx->stream(), XpuVarNdarray<T>(target_blob->shape_view(), pos_weight_processed),
          XpuVarNdarray<const T>(pos_weight_shape, pos_weight),
          XpuVarNdarray<const T>(target_blob->shape_view(), target));
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kBoth>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, out, input, target, weight, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight>(), elem_cnt, out, input,
            target, pos_weight_processed, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    } else {
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kWeight>(), elem_cnt, out, input,
            target, weight, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Binary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone>(), elem_cnt, out, input,
            target, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
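// Backward kernel: mirrors the forward dispatch, but launches the gradient functors with
// dy as an extra input and writes into dx.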
template<typename T>
class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsGradKernel() = default;
  ~BinaryCrossEntropyWithLogitsGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    if (ctx->Attr<bool>("has_pos_weight")) {
      T* pos_weight_processed = tmp_buffer_blob->mut_dptr<T>();
      const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr<T>();
      Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes());
      pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1,
                           ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt());
      NdarrayUtil<DeviceType::kCUDA, T>::BroadcastMul(
          ctx->stream(), XpuVarNdarray<T>(target_blob->shape_view(), pos_weight_processed),
          XpuVarNdarray<const T>(pos_weight_shape, pos_weight),
          XpuVarNdarray<const T>(target_blob->shape_view(), target));
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kBoth>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    } else {
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kWeight>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone>(), elem_cnt, dx, input,
            target, dy, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
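// The tmp buffer only has to hold pos_weight_processed (one value per element of
// input/target), so the size functions request n * sizeof(T), CUDA-aligned, and only
// when has_pos_weight is set.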
template<typename T>
user_op::InferTmpSizeFn GenFwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) {
    const int64_t n = ctx->InputShape("input", 0).elem_cnt();
    size_t tmp_buffer_size = 0;
    if (ctx->Attr<bool>("has_pos_weight")) { tmp_buffer_size += GetCudaAlignedSize(n * sizeof(T)); }
    return tmp_buffer_size;
  };
}

template<typename T>
user_op::InferTmpSizeFn GenBwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) {
    const int64_t n = ctx->InputShape("target", 0).elem_cnt();
    size_t tmp_buffer_size = 0;
    if (ctx->Attr<bool>("has_pos_weight")) { tmp_buffer_size += GetCudaAlignedSize(n * sizeof(T)); }
    return tmp_buffer_size;
  };
}
} // namespace

#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype)                                        \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits")                                 \
      .SetCreateFn<BinaryCrossEntropyWithLogitsKernel<dtype>>()                            \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))   \
      .SetInferTmpSizeFn(GenFwInferTmpSizeFn<dtype>());

#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype)                                   \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_grad")                            \
      .SetCreateFn<BinaryCrossEntropyWithLogitsGradKernel<dtype>>()                        \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)     \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value))    \
      .SetInferTmpSizeFn(GenBwInferTmpSizeFn<dtype>());

REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double)

} // namespace user_op
} // namespace oneflow
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
/* /*
Copyright 2020 The OneFlow Authors. All rights reserved. Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
#include "oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel_util.h" #include "oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h" #include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/hip/elementwise.hip.h" #include "oneflow/core/hip/elementwise.hip.h"
#include <hipcub/hipcub.hpp> #include <hipcub/hipcub.hpp>
#include "oneflow/core/kernel/cuda_graph_support.h" #include "oneflow/core/kernel/cuda_graph_support.h"
namespace oneflow {
namespace user_op {
namespace {

constexpr int32_t kBlockSize = 1024;
constexpr int32_t kReduceLocalSumBlockSize = 1024;
constexpr int32_t kSingleBlockProcessNumThreshold = 1024;

template<typename T>
struct DefaultComputeType {
  using type = T;
};

template<>
struct DefaultComputeType<half> {
  using type = float;
};
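// GetNumBlocks derives a grid size from the device's multiprocessor count and the
// kernel's occupancy (active blocks per SM), capped by max_blocks and scaled by the
// requested number of waves, so the launch neither undersubscribes nor grossly
// oversubscribes the GPU.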
template<class Func>
inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size,
                               int64_t max_blocks, int64_t waves, int* num_blocks) {
  int dev;
  {
    hipError_t err = hipGetDevice(&dev);
    if (err != hipSuccess) { return err; }
  }
  int sm_count;
  {
    hipError_t err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev);
    if (err != hipSuccess) { return err; }
  }
  int max_active_blocks;
  {
    hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func,
                                                                  block_size, dynamic_smem_size);
    if (err != hipSuccess) { return err; }
  }
  *num_blocks =
      std::max<int>(1, std::min<int64_t>(max_blocks, sm_count * max_active_blocks * waves));
  return hipSuccess;
}
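// Fused forward + reduce-mean kernel. Each thread accumulates the numerically stable
// per-element loss
//   (1 - t) * x + max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
// over a grid-stride loop, a block-wide hipcub reduction sums the partials, and thread 0
// of each block writes its partial sum divided by reduce_elem_cnt.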
template<typename In, typename Out, typename ComputeType>
__global__ void FusedBinaryCrossEntropyWithLogitsReduceMeanKernel(const In* input, const In* target,
                                                                   Out* out,
                                                                   const int32_t local_elem_cnt,
                                                                   const int32_t reduce_elem_cnt) {
  ComputeType zero = static_cast<ComputeType>(0.0);
  ComputeType one = static_cast<ComputeType>(1.0);
  using BlockReduce = hipcub::BlockReduce<ComputeType, kBlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  ComputeType reduce_sum = 0.0;
  CUDA_1D_KERNEL_LOOP(i, local_elem_cnt) {
    const ComputeType input_val = static_cast<ComputeType>(input[i]);
    const ComputeType target_val = static_cast<ComputeType>(target[i]);
    const ComputeType max_val = -input_val < zero ? zero : -input_val;
    const ComputeType result =
        (one - target_val) * input_val + max_val + (log(exp(-max_val) + exp(-input_val - max_val)));
    reduce_sum += result;
  }
  const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum);
  if (threadIdx.x == 0) { out[blockIdx.x] = static_cast<Out>(block_reduce_sum / reduce_elem_cnt); }
}

template<typename Out, typename ComputeType>
__global__ void ReduceLocalSumKernel(ComputeType* block_local_sum_buf, Out* out, int64_t elem_cnt) {
  using BlockReduce = hipcub::BlockReduce<ComputeType, kReduceLocalSumBlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  ComputeType reduce_sum = 0.0;
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) { reduce_sum += block_local_sum_buf[i]; }
  const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum);
  if (threadIdx.x == 0) { out[0] = static_cast<Out>(block_reduce_sum); }
}
template<typename T>
__device__ __forceinline__ T Sigmoid(const T x) {
  const T half_of_one = static_cast<T>(0.5);
  return half_of_one * tanh(half_of_one * x) + half_of_one;
}

template<>
__device__ __forceinline__ half Sigmoid(const half x) {
  return __float2half(Sigmoid(__half2float(x)));
}

template<typename T, typename ComputeType>
struct BinaryCrossEntropyWithLogitsReduceMeanGradFunctor {
  OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradFunctor(
      const T elem_cnt_reciprocal, const T dy)
      : elem_cnt_reciprocal(elem_cnt_reciprocal), dy(dy) {}
  __device__ T operator()(const T input_val, const T target_val) const {
    return (Sigmoid(input_val) - target_val) * dy * elem_cnt_reciprocal;
  }
  const T elem_cnt_reciprocal;
  const T dy;
};

template<typename T, typename ComputeType>
struct BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor {
  OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor(
      const int32_t elem_cnt, const T* dy_ptr)
      : elem_cnt_reciprocal(1.0f / elem_cnt), dy_ptr(dy_ptr) {}
  __device__ BinaryCrossEntropyWithLogitsReduceMeanGradFunctor<T, ComputeType> operator()() const {
    return BinaryCrossEntropyWithLogitsReduceMeanGradFunctor<T, ComputeType>(elem_cnt_reciprocal,
                                                                             *dy_ptr);
  }
  const T elem_cnt_reciprocal;
  const T* dy_ptr;
};
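// Reduce-mean forward kernel. Small inputs (<= kSingleBlockProcessNumThreshold elements)
// are handled by a single block that writes the mean directly to out; larger inputs first
// produce per-block partial means in tmp_buffer, which ReduceLocalSumKernel then sums
// into the scalar output.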
template<typename T>
class BinaryCrossEntropyWithLogitsMeanKernel final : public user_op::OpKernel,
                                                     public CudaGraphSupport {
 public:
  BinaryCrossEntropyWithLogitsMeanKernel() = default;
  ~BinaryCrossEntropyWithLogitsMeanKernel() override = default;

  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }

  std::shared_ptr<user_op::OpKernelCache> InitOpKernelCache(
      user_op::KernelCacheContext* ctx) const override {
    return CreateBCEWithLogitsReduceMeanKernelCache(ctx);
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache* cache) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    int64_t local_elem_cnt = input_blob->shape_view().elem_cnt();
    int64_t reduce_elem_cnt = local_elem_cnt;
    if (cache != nullptr) {
      // Because `out`'s SBP may be P or B, we need to use reduce_elem_cnt as the reduce_mean
      // factor.
      const auto* bce_cache = dynamic_cast<const BCEWithLogitsReduceMeanKernelCache*>(cache);
      CHECK_NOTNULL(bce_cache);
      reduce_elem_cnt = bce_cache->reduce_elem_cnt();
    }
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    using ComputeType = typename DefaultComputeType<T>::type;
    if (local_elem_cnt <= kSingleBlockProcessNumThreshold) {
      FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, T, ComputeType>
          <<<1, kBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              input_blob->dptr<T>(), target_blob->dptr<T>(), out_blob->mut_dptr<T>(),
              local_elem_cnt, reduce_elem_cnt);
    } else {
      auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
      const int64_t tmp_buffer_elem_cnt = tmp_buffer->shape_view().elem_cnt() / sizeof(T);
      const int64_t block_num = (local_elem_cnt + kBlockSize - 1) / kBlockSize;
      int launch_block = block_num;
      OF_CUDA_CHECK(GetNumBlocks(
          FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, ComputeType, ComputeType>,
          kBlockSize, 0, block_num, 32, &launch_block));
      launch_block = std::min<int32_t>(tmp_buffer_elem_cnt, launch_block);
      FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, ComputeType, ComputeType>
          <<<launch_block, kBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              input_blob->dptr<T>(), target_blob->dptr<T>(), tmp_buffer->mut_dptr<ComputeType>(),
              local_elem_cnt, reduce_elem_cnt);
      ReduceLocalSumKernel<T, ComputeType>
          <<<1, kReduceLocalSumBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              tmp_buffer->mut_dptr<ComputeType>(), out_blob->mut_dptr<T>(), block_num);
    }
  }
};
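// Reduce-mean backward kernel: dx[i] = (Sigmoid(input[i]) - target[i]) * dy / reduce_elem_cnt.
// The Dyptr factory reads the scalar dy directly from device memory when the elementwise
// kernel constructs its functor, so no device-to-host copy of dy is needed.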
template<typename T>
class BinaryCrossEntropyWithLogitsReduceMeanGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default;
  ~BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default;

  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }

  std::shared_ptr<user_op::OpKernelCache> InitOpKernelCache(
      user_op::KernelCacheContext* ctx) const override {
    return CreateBCEWithLogitsReduceMeanKernelCache(ctx);
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache* cache) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    int64_t local_elem_cnt = input_blob->shape_view().elem_cnt();
    int64_t reduce_elem_cnt = local_elem_cnt;
    if (cache != nullptr) {
      // Because `out`'s SBP may be P or B, we need to use reduce_elem_cnt as the reduce_mean
      // factor.
      const auto* bce_cache = dynamic_cast<const BCEWithLogitsReduceMeanKernelCache*>(cache);
      CHECK_NOTNULL(bce_cache);
      reduce_elem_cnt = bce_cache->reduce_elem_cnt();
    }
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    using ComputeType = typename DefaultComputeType<T>::type;
    OF_CUDA_CHECK((cuda::elementwise::BinaryWithFactory(
        BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor<T, ComputeType>(reduce_elem_cnt, dy),
        local_elem_cnt, dx, input, target, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
  }
};
} // namespace

#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(dtype)                                 \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean")                          \
      .SetCreateFn<BinaryCrossEntropyWithLogitsMeanKernel<dtype>>()                             \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                          \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)       \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value)      \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))        \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                       \
        const int64_t elem_cnt = ctx->InputShape("input", 0).elem_cnt();                        \
        const int64_t block_num = (elem_cnt + kBlockSize - 1) / kBlockSize;                     \
        int launch_block = block_num;                                                           \
        using ComputeType = typename DefaultComputeType<dtype>::type;                           \
        OF_CUDA_CHECK(GetNumBlocks(                                                             \
            FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<dtype, ComputeType, ComputeType>, \
            kBlockSize, 0, block_num, 32, &launch_block));                                      \
        const int64_t tmp_buffer_size = GetCudaAlignedSize(launch_block * sizeof(dtype));       \
        return tmp_buffer_size;                                                                 \
      });

#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(dtype)                       \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean_grad")                \
      .SetCreateFn<BinaryCrossEntropyWithLogitsReduceMeanGradKernel<dtype>>()              \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)     \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));

REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(double)

} // namespace user_op
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/ndarray/ndarray_util.h"
#include "oneflow/core/ndarray/xpu_var_ndarray.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"

namespace oneflow {
namespace {

template<typename T>
__global__ void ComputeLogGpu(const int64_t len, T* out, const T* in) {
  CUDA_1D_KERNEL_LOOP(i, len) { out[i] = SafeLog(in[i]); }
}

template<>
__global__ void ComputeLogGpu<float16>(const int64_t len, float16* out, const float16* in) {
  const half* _in = reinterpret_cast<const half*>(in);
  half* _out = reinterpret_cast<half*>(out);
  CUDA_1D_KERNEL_LOOP(i, len) { _out[i] = SafeLog(_in[i]); }
}
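// Gradient of z = pow(x, y) with respect to the (possibly broadcast) exponent y:
// dz/dy = z * log(x), so dy = reduce_sum(dz * z * log(x)) over the broadcast axes.
// The tmp buffer is zero-filled, broadcast-added with x to materialize x at z's shape,
// passed through SafeLog, multiplied by dz and z, and finally reduce-summed into dy.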
template<DeviceType device, typename T>
class BroadcastPowYGradKernel final : public user_op::OpKernel {
 public:
  BroadcastPowYGradKernel() = default;
  ~BroadcastPowYGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
    const user_op::Tensor* z_tensor = ctx->Tensor4ArgNameAndIndex("z", 0);
    const user_op::Tensor* dz_tensor = ctx->Tensor4ArgNameAndIndex("dz", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
    const int64_t num_axes = dz_tensor->shape_view().NumAxes();
    const int64_t elem_cnt = z_tensor->shape_view().elem_cnt();
    Memset<device>(ctx->stream(), tmp_buffer->mut_dptr<T>(), 0,
                   GetCudaAlignedSize(elem_cnt * sizeof(T)));
    XpuVarNdarray<const T> z(z_tensor->shape_view(), z_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<const T> dz(dz_tensor->shape_view(), dz_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<const T> const_tmp(dz.shape(), tmp_buffer->dptr<T>());
    XpuVarNdarray<T> tmp(dz.shape(), tmp_buffer->mut_dptr<T>());
    XpuVarNdarray<const T> x(x_tensor->shape_view(), x_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<T> dy(dy_tensor->shape_view(), dy_tensor->mut_dptr<T>(), num_axes);
    NdarrayUtil<device, T>::BroadcastAdd(ctx->stream(), tmp, x, const_tmp);
    ComputeLogGpu<T><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                       ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, tmp_buffer->mut_dptr<T>(), tmp_buffer->dptr<T>());
    NdarrayUtil<device, T>::BroadcastMul(ctx->stream(), tmp, dz, const_tmp);
    NdarrayUtil<device, T>::BroadcastMul(ctx->stream(), tmp, z, const_tmp);
    NdarrayUtil<device, T>::ReduceSum(ctx->stream(), dy, const_tmp, tmp);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
} // namespace

#define REGISTER_BROADCAST_POW_Y_GRAD_KERNEL(device, dtype_pair)                             \
  REGISTER_USER_KERNEL("broadcast_pow_y_grad")                                               \
      .SetCreateFn<BroadcastPowYGradKernel<device, OF_PP_PAIR_FIRST(dtype_pair)>>()          \
      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                  \
                       && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(dtype_pair)))   \
      .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) {                           \
        const user_op::TensorDesc& z = ctx->InputTensorDesc("z", 0);                         \
        const DataType& data_type = z.data_type();                                           \
        const int64_t elem_cnt = z.shape().elem_cnt();                                       \
        return GetCudaAlignedSize(elem_cnt * GetSizeOfDataType(data_type));                  \
      });

OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_BROADCAST_POW_Y_GRAD_KERNEL, (DeviceType::kCUDA),
                                 ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ)

} // namespace oneflow
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
/* /*
Copyright 2020 The OneFlow Authors. All rights reserved. Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
#ifdef NDEBUG #ifdef NDEBUG
#undef NDEBUG #undef NDEBUG
#endif #endif
#include <assert.h> #include <assert.h>
#include "oneflow/user/kernels/categorical_ordinal_encode_kernel_util.h" #include "oneflow/user/kernels/categorical_ordinal_encode_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.hip.h" #include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h" #include "oneflow/core/ep/rocm/cuda_stream.h"
namespace oneflow { namespace oneflow {
namespace { namespace {
using CuInt64T = unsigned long long int; using CuInt64T = unsigned long long int;
__device__ __inline__ int32_t AtomicCAS(int32_t* address, int32_t compare, int32_t val) {
  return atomicCAS(address, compare, val);
}

__device__ __inline__ int64_t AtomicCAS(int64_t* address, int64_t compare, int64_t val) {
  static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error");
  return static_cast<int64_t>(atomicCAS(reinterpret_cast<CuInt64T*>(address),
                                        static_cast<CuInt64T>(compare),
                                        static_cast<CuInt64T>(val)));
}

__device__ __inline__ int32_t AtomicAdd(int32_t* address, int32_t val) {
  return atomicAdd(address, val);
}

__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) {
  static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error");
  return static_cast<int64_t>(
      atomicAdd(reinterpret_cast<CuInt64T*>(address), static_cast<CuInt64T>(val)));
}
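// TryGetOrInsert claims an empty slot (key == 0) with an atomic compare-and-swap on the
// key. The winning thread draws a fresh ordinal from AtomicAdd(size, 1) + 1 and publishes
// it through the volatile value pointer; a thread that finds its own hash already in the
// slot spin-waits until that value becomes non-zero. A slot owned by a different hash
// reports failure so the caller keeps probing.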
template<typename K, typename V>
__device__ bool TryGetOrInsert(K* key, volatile V* value, V* size, const K hash, V* out) {
  K old_key = AtomicCAS(key, static_cast<K>(0), hash);
  if (old_key == 0) {
    V v = AtomicAdd(size, 1) + 1;
    *value = v;
    *out = v;
    return true;
  } else if (old_key == hash) {
    while (true) {
      V v = *value;
      if (v != 0) {
        *out = v;
        break;
      }
    }
    return true;
  } else {
    return false;
  }
}
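// GetOrInsertOne linearly probes the open-addressed table (stored as interleaved
// key/value pairs) starting at hash % capacity. The hash value 0 is reserved: it always
// encodes to 0 and marks empty slots. EncodeGpu maps every input hash to its ordinal and
// asserts that the table never fills up.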
template<typename T>
__device__ bool GetOrInsertOne(const size_t capacity, T* table, T* size, const T hash, T* out) {
  if (hash == 0) {
    *out = 0;
    return true;
  }
  const size_t start_idx = static_cast<size_t>(hash) % capacity;
  // fast path
  {
    T* key = table + start_idx * 2;
    T* value = key + 1;
    if (*key == hash && *value != 0) {
      *out = *value;
      return true;
    }
  }
  for (size_t count = 0; count < capacity; ++count) {
    const size_t idx = (start_idx + count) % capacity;
    T* key = table + idx * 2;
    T* value = key + 1;
    if (TryGetOrInsert<T, T>(key, value, size, hash, out)) { return true; }
  }
  return false;
}

template<typename T>
__global__ void EncodeGpu(const size_t capacity, T* table, T* size, const int64_t n, const T* hash,
                          T* out) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    bool success = GetOrInsertOne<T>(capacity, table, size, hash[i], out + i);
    assert(success);
  }
}
} // namespace

template<typename T>
struct CategoricalOrdinalEncodeKernelUtil<DeviceType::kCUDA, T> {
  static void Encode(ep::Stream* stream, int64_t capacity, T* table, T* size, int64_t n,
                     const T* hash, T* out) {
    EncodeGpu<T>
        <<<BlocksNum4ThreadsNum(n), kCudaThreadsNumPerBlock, 0,
           stream->As<ep::CudaStream>()->cuda_stream()>>>(capacity, table, size, n, hash, out);
  }
};

#define INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA(type_cpp, type_proto) \
  template struct CategoricalOrdinalEncodeKernelUtil<DeviceType::kCUDA, type_cpp>;
OF_PP_FOR_EACH_TUPLE(INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ);
#undef INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA

} // namespace oneflow