Commit a715222c authored by yuguo

0.9.1-rocm

parent f262efc9
......@@ -60,3 +60,51 @@ REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemcpyFactory, MemcpyFactoryImpl);
} // namespace oneflow
#endif
#ifdef WITH_ROCM
#include "oneflow/core/ep/include/primitive/memcpy.h"
#include "oneflow/core/ep/cuda/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
class MemcpyImpl : public Memcpy {
public:
OF_DISALLOW_COPY_AND_MOVE(MemcpyImpl);
MemcpyImpl() = default;
~MemcpyImpl() override = default;
void Launch(Stream* stream, void* dst, const void* src, size_t count) override {
if (dst == src) { return; }
auto* cuda_stream = stream->As<CudaStream>();
OF_CUDA_CHECK(hipMemcpyAsync(dst, src, count, hipMemcpyDefault, cuda_stream->cuda_stream()));
}
};
class MemcpyFactoryImpl : public MemcpyFactory {
public:
OF_DISALLOW_COPY_AND_MOVE(MemcpyFactoryImpl);
MemcpyFactoryImpl() = default;
~MemcpyFactoryImpl() override = default;
std::unique_ptr<Memcpy> New(MemcpyKind kind) override {
return std::unique_ptr<Memcpy>(new MemcpyImpl());
}
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemcpyFactory, MemcpyFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif
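For context, callers do not instantiate MemcpyImpl directly; they resolve it through the factory registry. A minimal sketch, assuming OneFlow's NewPrimitive helper from primitive.h and pre-existing stream, dst, src, and count (none of which are part of this diff):

// Hypothetical call site: the registry returns the kCUDA-registered factory,
// which in this ROCm build wraps hipMemcpyAsync.
auto copy = oneflow::ep::primitive::NewPrimitive<oneflow::ep::primitive::MemcpyFactory>(
    oneflow::DeviceType::kCUDA, oneflow::ep::primitive::MemcpyKind::kDtoD);
if (copy) { copy->Launch(stream, dst, src, count); }  // asynchronous on `stream`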
......@@ -57,3 +57,49 @@ REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemsetFactory, MemsetFactoryImpl);
} // namespace oneflow
#endif
#ifdef WITH_ROCM
#include "oneflow/core/ep/include/primitive/memset.h"
#include "oneflow/core/ep/cuda/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
class MemsetImpl : public Memset {
public:
OF_DISALLOW_COPY_AND_MOVE(MemsetImpl);
MemsetImpl() = default;
~MemsetImpl() override = default;
void Launch(Stream* stream, void* ptr, int value, size_t count) override {
auto* cuda_stream = stream->As<CudaStream>();
OF_CUDA_CHECK(hipMemsetAsync(ptr, value, count, cuda_stream->cuda_stream()));
}
};
class MemsetFactoryImpl : public MemsetFactory {
public:
OF_DISALLOW_COPY_AND_MOVE(MemsetFactoryImpl);
MemsetFactoryImpl() = default;
~MemsetFactoryImpl() override = default;
std::unique_ptr<Memset> New() override { return std::unique_ptr<Memset>(new MemsetImpl()); }
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemsetFactory, MemsetFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif
......@@ -16,7 +16,11 @@ limitations under the License.
#include "oneflow/core/ep/include/primitive/permute.h"
#include "oneflow/core/ep/common/primitive/permute_impl.h"
#include "oneflow/core/ep/cuda/cuda_stream.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif
namespace oneflow {
......@@ -192,7 +196,7 @@ __global__ void BatchTransposeMovement2Kernel(const void* src_ptr, void* dst_ptr
}
template<size_t num_dims, size_t movement_size, size_t tile_size, typename IndexType>
-void LaunchBatchTransposeKernel(cudaStream_t& cuda_stream,
+void LaunchBatchTransposeKernel(GPU(Stream_t)& cuda_stream,
const PermuteKernelParams<num_dims, IndexType>& params,
const IndexType& num_batches, const IndexType& rows,
const IndexType& cols) {
......@@ -264,7 +268,7 @@ void LaunchKernel(Stream* stream, const int64_t* src_dims, const void* src, cons
void* dst, size_t count) {
PermuteKernelParams<num_dims, IndexType> params =
MakePermuteParams<num_dims, IndexType>(src_dims, src, permutation, dst, count);
-cudaStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
+GPU(Stream_t) cuda_stream = stream->As<CudaStream>()->cuda_stream();
if (num_dims == 2 || num_dims == 3) {
IndexType num_batches;
......@@ -281,10 +285,12 @@ void LaunchKernel(Stream* stream, const int64_t* src_dims, const void* src, cons
cuda_stream, params, num_batches, rows, cols);
}
} else {
if (params.count == 0) { return; }
PermuteKernel<num_dims, movement_size, IndexType>
<<<BlocksNum4ThreadsNum(params.count), kCudaThreadsNumPerBlock, 0, cuda_stream>>>(params);
}
} else {
if (params.count == 0) { return; }
PermuteKernel<num_dims, movement_size, IndexType>
<<<BlocksNum4ThreadsNum(params.count), kCudaThreadsNumPerBlock, 0, cuda_stream>>>(params);
}
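The two `if (params.count == 0) { return; }` guards added above are not cosmetic: BlocksNum4ThreadsNum(0) yields a zero-sized grid, and launching with zero blocks fails with an invalid-configuration error on both CUDA and HIP. A standalone illustration of the same guard pattern (hypothetical kernel, not from this commit):

__global__ void Noop(float* p, size_t n) { /* no-op; placeholder body */ }

void SafeLaunch(GPU(Stream_t) stream, float* p, size_t n) {
  if (n == 0) { return; }  // skip the launch entirely; a zero-block grid is an error
  const unsigned blocks = static_cast<unsigned>((n + 255) / 256);
  Noop<<<blocks, 256, 0, stream>>>(p, n);
}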
......
......@@ -32,7 +32,7 @@ enum class Algorithm {
};
template<Algorithm algorithm, typename T>
-void SoftmaxGpu(cudaStream_t cuda_stream, size_t rows, size_t cols, const T* x, T* y) {
+void SoftmaxGpu(GPU(Stream_t) cuda_stream, size_t rows, size_t cols, const T* x, T* y) {
using ComputeType = typename cuda::softmax::DefaultComputeType<T>::type;
oneflow::cuda::softmax::DirectLoad<T, ComputeType> load(x, cols);
oneflow::cuda::softmax::DirectStore<ComputeType, T> store(y, cols);
......@@ -55,7 +55,7 @@ class SoftmaxImpl : public SoftmaxBase {
~SoftmaxImpl() override = default;
void Launch(Stream* stream, size_t rows, size_t cols, const void* x, void* y) override {
-cudaStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
+GPU(Stream_t) cuda_stream = stream->As<CudaStream>()->cuda_stream();
SoftmaxGpu<algorithm, T>(cuda_stream, rows, cols, reinterpret_cast<const T*>(x),
reinterpret_cast<T*>(y));
}
......
......@@ -32,7 +32,7 @@ enum class Algorithm {
};
template<Algorithm algorithm, typename T>
-void SoftmaxBackwardGpu(cudaStream_t cuda_stream, size_t rows, size_t cols, const T* y, const T* dy,
+void SoftmaxBackwardGpu(GPU(Stream_t) cuda_stream, size_t rows, size_t cols, const T* y, const T* dy,
T* dx) {
using ComputeType = typename cuda::softmax::DefaultComputeType<T>::type;
cuda::softmax::DirectLoad<T, ComputeType> load_y(y, cols);
......@@ -60,7 +60,7 @@ class SoftmaxBackwardImpl : public SoftmaxBackwardBase {
void Launch(Stream* stream, size_t rows, size_t cols, const void* y, const void* dy,
void* dx) override {
-cudaStream_t cuda_stream = stream->As<CudaStream>()->cuda_stream();
+GPU(Stream_t) cuda_stream = stream->As<CudaStream>()->cuda_stream();
SoftmaxBackwardGpu<algorithm, T>(cuda_stream, rows, cols, reinterpret_cast<const T*>(y),
reinterpret_cast<const T*>(dy), reinterpret_cast<T*>(dx));
}
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/include/primitive/tensor_fill.h"
#include "oneflow/core/ep/cuda/primitive/type_seq.h"
#include "oneflow/core/ep/cuda/cuda_stream.h"
namespace oneflow {
namespace ep {
namespace primitive {
namespace {
template<size_t size>
using Storage = typename std::aligned_storage<size, size>::type;
template<typename T, size_t pack>
union Pack {
static constexpr size_t size = sizeof(T) * pack;
explicit __device__ __host__ Pack(const T value) {
static_assert(sizeof(Pack) == size, "");
static_assert(alignof(Pack) == size, "");
#pragma unroll
for (size_t i = 0; i < pack; ++i) { elem[i] = value; }
}
T elem[pack];
Storage<size> storage;
};
template<typename T, size_t pack>
__global__ void TensorFillGpu(T* dst, const T* value, size_t count) {
const size_t pack_count = count / pack;
const T fill_value = value[0];
Pack<T, pack> pack_value(fill_value);
auto* pack_dst = reinterpret_cast<decltype(pack_value.storage)*>(dst);
CUDA_1D_KERNEL_LOOP_T(size_t, i, pack_count) { pack_dst[i] = pack_value.storage; }
T* tail_dst = dst + pack_count * pack;
const size_t tail_count = count - pack_count * pack;
CUDA_1D_KERNEL_LOOP_T(size_t, i, tail_count) { tail_dst[i] = fill_value; }
}
template<typename T, size_t pack>
typename std::enable_if<(pack != 0), void>::type LaunchPackTensorFill(GPU(Stream_t) stream, T* dst,
const T* value,
size_t count) {
TensorFillGpu<T, pack>
<<<BlocksNum4ThreadsNum(count), kCudaThreadsNumPerBlock, 0, stream>>>(dst, value, count);
}
template<typename T, size_t pack>
typename std::enable_if<(pack == 0), void>::type LaunchPackTensorFill(GPU(Stream_t) stream, T* dst,
const T* value,
size_t count) {
LOG(FATAL) << "wrong alignment";
}
template<typename T>
void LaunchTensorFill(GPU(Stream_t) stream, T* dst, const T* value, size_t count) {
auto uintptr = reinterpret_cast<std::uintptr_t>(dst);
if (uintptr % 16 == 0) {
LaunchPackTensorFill<T, 16 / sizeof(T)>(stream, dst, value, count);
} else if (uintptr % 8 == 0) {
LaunchPackTensorFill<T, 8 / sizeof(T)>(stream, dst, value, count);
} else if (uintptr % 4 == 0) {
LaunchPackTensorFill<T, 4 / sizeof(T)>(stream, dst, value, count);
} else if (uintptr % 2 == 0) {
LaunchPackTensorFill<T, 2 / sizeof(T)>(stream, dst, value, count);
} else {
LaunchPackTensorFill<T, 1 / sizeof(T)>(stream, dst, value, count);
}
}
template<typename T>
class TensorFillImpl : public TensorFill {
public:
OF_DISALLOW_COPY_AND_MOVE(TensorFillImpl);
TensorFillImpl() = default;
~TensorFillImpl() override = default;
void Launch(Stream* stream, const void* src, void* dst, size_t count) override {
GPU(Stream_t) cuda_stream = stream->As<CudaStream>()->cuda_stream();
const T* value = reinterpret_cast<const T*>(src);
LaunchTensorFill<T>(cuda_stream, reinterpret_cast<T*>(dst), value, count);
}
};
template<typename T>
std::unique_ptr<TensorFill> NewTensorFill() {
return std::unique_ptr<TensorFill>(new TensorFillImpl<T>());
}
class TensorFillFactoryImpl : public TensorFillFactory {
public:
OF_DISALLOW_COPY_AND_MOVE(TensorFillFactoryImpl);
TensorFillFactoryImpl() = default;
~TensorFillFactoryImpl() override = default;
std::unique_ptr<TensorFill> New(DataType data_type) override {
#define MAKE_NEW_TENSOR_FILL_ENTRY(type_cpp, type_proto) {type_proto, NewTensorFill<type_cpp>},
static const std::map<DataType, std::function<std::unique_ptr<TensorFill>()>> new_fill_handle{
OF_PP_FOR_EACH_TUPLE(MAKE_NEW_TENSOR_FILL_ENTRY, CUDA_PRIMITIVE_ALL_TYPE_SEQ)};
#undef MAKE_NEW_TENSOR_FILL_ENTRY
const auto it = new_fill_handle.find(data_type);
if (it != new_fill_handle.end()) {
return it->second();
} else {
return nullptr;
}
}
};
REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, TensorFillFactory, TensorFillFactoryImpl);
} // namespace
} // namespace primitive
} // namespace ep
} // namespace oneflow
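The alignment ladder in LaunchTensorFill picks the widest store the destination pointer permits: a 16-byte-aligned float* gets Pack<float, 4>, so each loop iteration writes one 16-byte Storage value. A compile-time restatement of that dispatch rule (standalone sketch, not OneFlow API):

#include <cstddef>
#include <cstdint>

// Mirrors the 16/8/4/2-byte ladder above; returns elements per vector store.
// For a pointer with no usable alignment, 1 / sizeof(T) is 0 for any T wider
// than one byte, which selects the pack == 0 overload that LOG(FATAL)s.
template<typename T>
constexpr std::size_t PackSize(std::uintptr_t addr) {
  return addr % 16 == 0 ? 16 / sizeof(T)
         : addr % 8 == 0 ? 8 / sizeof(T)
         : addr % 4 == 0 ? 4 / sizeof(T)
         : addr % 2 == 0 ? 2 / sizeof(T)
                         : 1 / sizeof(T);
}
static_assert(PackSize<float>(64) == 4, "16B-aligned float* -> 4-wide fill");
static_assert(PackSize<double>(8) == 1, "8B-aligned double* -> scalar fill");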
......@@ -63,6 +63,12 @@ limitations under the License.
CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \
CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#define CUDA_PRIMITIVE_INT_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT32_TYPE_SEQ \
CUDA_PRIMITIVE_INT64_TYPE_SEQ
#define UTIL_OPS_DATA_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
......@@ -75,4 +81,66 @@ limitations under the License.
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
// #if CUDA_VERSION >= 11000
// #include <cuda_bf16.h>
// #endif // CUDA_VERSION >= 11000
#define CUDA_PRIMITIVE_BOOL_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(bool, DataType::kBool)
#define CUDA_PRIMITIVE_CHAR_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(char, DataType::kChar)
#define CUDA_PRIMITIVE_INT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8)
#define CUDA_PRIMITIVE_UINT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8)
#define CUDA_PRIMITIVE_INT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32)
#define CUDA_PRIMITIVE_UINT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32)
#define CUDA_PRIMITIVE_INT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64)
#define CUDA_PRIMITIVE_UINT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64)
#define CUDA_PRIMITIVE_FLOAT_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat)
#define CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(double, DataType::kDouble)
#define CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(half, DataType::kFloat16)
// #if CUDA_VERSION >= 11000
// #define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(nv_bfloat16, DataType::kBFloat16)
// #else
#define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
// #endif // CUDA_VERSION >= 11000
#define CUDA_PRIMITIVE_ALL_TYPE_SEQ \
CUDA_PRIMITIVE_BOOL_TYPE_SEQ \
CUDA_PRIMITIVE_CHAR_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT32_TYPE_SEQ \
CUDA_PRIMITIVE_INT64_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \
CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \
CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#define CUDA_PRIMITIVE_FLOATING_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \
CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \
CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#define CUDA_PRIMITIVE_INT_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT32_TYPE_SEQ \
CUDA_PRIMITIVE_INT64_TYPE_SEQ
#define UTIL_OPS_DATA_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT32_TYPE_SEQ \
CUDA_PRIMITIVE_INT64_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \
CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \
CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#endif // WITH_ROCM
#endif // ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_
......@@ -17,14 +17,19 @@ limitations under the License.
#include "oneflow/core/ep/cuda/primitive/type_seq.h"
#include "oneflow/core/cuda/elementwise.cuh"
#include "oneflow/core/ep/cuda/cuda_stream.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#else
#include <cuda.h>
#endif
namespace oneflow {
namespace ep {
namespace primitive {
template<typename Dst, typename Src>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kGelu, Dst, Src> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC Dst operator()(Src src) const {
return static_cast<Src>(0.5) * src
......@@ -32,78 +37,236 @@ struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kGelu, Dst, Src> {
}
};
template<typename Dst, typename Src>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kFastGelu, Dst, Src> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC Dst operator()(Src src) const {
// ref to: https://mlfromscratch.com/activation-functions-explained/#gelu
const Src half = static_cast<Src>(0.5);
const Src one = static_cast<Src>(1);
const Src tanh_in = alpha * (src + beta * src * src * src);
return half * src * (one + tanh(tanh_in));
}
private:
// constant ref to:
// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/test/testdata/transform/fusion/fast_gelu.py
static constexpr Src alpha = static_cast<Src>(0.7978845608028654);
static constexpr Src beta = static_cast<Src>(0.044714998453855515);
};
template<typename Dst, typename Src>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kQuickGelu, Dst, Src> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC Dst operator()(Src src) const {
const Src sigmoid = static_cast<Src>(1.0) / (static_cast<Src>(1.0) + exp(-src * alpha));
return static_cast<Dst>(src * sigmoid);
}
private:
static constexpr Src alpha = static_cast<Src>(1.702);
};
namespace unary_functor_internal {
namespace {
OF_DEVICE_FUNC
float TanhApprox(float x) {
#if (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
float r;
asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(r) : "f"(x));
return r;
#else
return tanhf(x);
#endif // (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
}
} // namespace
} // namespace unary_functor_internal
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kFastGelu, half, half> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {}
OF_DEVICE_FUNC half operator()(half src) const {
#if (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
const float tanh_in =
__half2float(__float2half_rn(alpha) * (src + __float2half_rn(beta) * src * src * src));
const float tanh_out = unary_functor_internal::TanhApprox(tanh_in);
return __float2half_rn(0.5F) * src * (__float2half_rn(1.0F) + __float2half_rn(tanh_out));
#else
return static_cast<half>(float_functor(static_cast<float>(src)));
#endif // (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
}
#if (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
__device__ void Apply2(half* dst, const half* src) const {
const half2 src2 = *(reinterpret_cast<const half2*>(src));
const float2 tanh_in = __half22float2(__hmul2(
__float2half2_rn(alpha),
__hadd2(src2, __hmul2(__hmul2(__hmul2(__float2half2_rn(beta), src2), src2), src2))));
float2 tanh_out;
tanh_out.x = unary_functor_internal::TanhApprox(tanh_in.x);
tanh_out.y = unary_functor_internal::TanhApprox(tanh_in.y);
const half2 dst2 = __hmul2(__hmul2(__float2half2_rn(0.5F), src2),
__hadd2(__float2half2_rn(1.0F), __float22half2_rn(tanh_out)));
*reinterpret_cast<half2*>(dst) = dst2;
}
#endif // (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
private:
static constexpr float alpha = 0.7978845608028654F;
static constexpr float beta = 0.044714998453855515F;
UnaryFunctor<DeviceType::kCUDA, UnaryOp::kFastGelu, float, float> float_functor;
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTanh, float, float> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC float operator()(float src) const { return tanhf(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTanh, double, double> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC double operator()(double src) const { return tanh(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTanh, half, half> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC half operator()(half src) const { return __float2half(tanhf(__half2float(src))); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, half> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(half src) const { return isinf(__half2float(src)); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, float> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(float src) const { return isinf(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, double> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(double src) const { return isinf(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, half> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(half src) const { return isnan(__half2float(src)); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, float> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(float src) const { return isnan(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, double> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(double src) const { return isnan(src); }
};
-#define SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(op) \
-template<> \
-struct UnaryFunctor<DeviceType::kCUDA, op, half, half> { \
-UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
-\
-UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
-OF_DEVICE_FUNC half operator()(half src) const { \
-return __float2half(float_functor(__half2float(src))); \
-} \
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsFinite, bool, half> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(half src) const { return isfinite(__half2float(src)); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsFinite, bool, float> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(float src) const { return isfinite(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsFinite, bool, double> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(double src) const { return isfinite(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTrunc, half, half> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
__device__ half operator()(half src) const { return htrunc(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTrunc, float, float> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC float operator()(float src) const { return truncf(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTrunc, double, double> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC double operator()(double src) const { return trunc(src); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kAbs, half, half> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
__device__ half operator()(half src) const {
return __hlt(src, static_cast<half>(0)) ? __hneg(src) : src;
}
};
template<typename Dst, typename Src>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kNanAssign, Dst, Src> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC Dst operator()(Src src) const { return isnan(src) ? static_cast<Dst>(0.0) : src; }
};
#if CUDA_VERSION >= 11000
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kAbs, nv_bfloat16, nv_bfloat16> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
__device__ nv_bfloat16 operator()(nv_bfloat16 src) const {
#if __CUDA_ARCH__ >= 800
return __habs(src);
#else
return __float2bfloat16(abs(__bfloat162float(src)));
#endif // __CUDA_ARCH__ >= 800
}
};
#endif // CUDA_VERSION >= 11000
+#define SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(op) \
+template<> \
+struct UnaryFunctor<DeviceType::kCUDA, op, half, half> { \
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
+\
+UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
+OF_DEVICE_FUNC half operator()(half src) const { \
+return __float2half(float_functor(__half2float(src))); \
+} \
};
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kElu);
......@@ -114,20 +277,53 @@ SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSelu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSilu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftSign);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSoftPlus);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kAcos);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kAcosh);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kAsin);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kAsinh);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kAtan);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kAtanh);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kCeil);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kCos);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kCosh);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kErf);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kErfc);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kExp);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kExpm1);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kFloor);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kLgamma);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kLog);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kLog2);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kLog10);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kLog1p);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kLogSigmoid);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kRint);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kRound);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kRsqrt);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSigmoid);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSin);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSinh);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSqrt);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSquare);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kTan);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kReciprocalNoNan);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kNotEqualZero);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kNanAssign);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kQuickGelu);
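Each SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR line above expands to a full specialization that round-trips through the float functor; for example, the kSin entry expands to:

template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kSin, half, half> {
  OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {}

  UnaryFunctor<DeviceType::kCUDA, UnaryOp::kSin, float, float> float_functor;
  OF_DEVICE_FUNC half operator()(half src) const {
    return __float2half(float_functor(__half2float(src)));  // compute in float, store as half
  }
};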
/*********nv_bfloat16_kernel*******/
#if CUDA_VERSION >= 11000
-#define SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(op) \
-template<> \
-struct UnaryFunctor<DeviceType::kCUDA, op, nv_bfloat16, nv_bfloat16> { \
-UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
-\
-UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
-OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src) const { \
-return __float2bfloat16(float_functor(__bfloat162float(src))); \
-} \
+#define SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(op) \
+template<> \
+struct UnaryFunctor<DeviceType::kCUDA, op, nv_bfloat16, nv_bfloat16> { \
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
+\
+UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
+OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src) const { \
+return __float2bfloat16(float_functor(__bfloat162float(src))); \
+} \
};
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kElu);
......@@ -146,6 +342,40 @@ SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftSign);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftPlus);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kTanh);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kThreshold);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kAcos);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kAcosh);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kAsin);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kAsinh);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kAtan);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kAtanh);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kCeil);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kCos);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kCosh);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kErf);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kErfc);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kExp);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kExpm1);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kFloor);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLgamma);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLog);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLog2);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLog10);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLog1p);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kLogSigmoid);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kRint);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kRound);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kRsqrt);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSigmoid);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSin);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSinh);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSqrt);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSquare);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kTan);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kReciprocalNoNan);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kNotEqualZero);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kNanAssign);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kFastGelu);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kQuickGelu);
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsInf, bool, nv_bfloat16> {
......@@ -160,8 +390,26 @@ struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, nv_bfloat16> {
OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isnan(__bfloat162float(src)); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsFinite, bool, nv_bfloat16> {
-UnaryFunctor(Scalar attr0, Scalar attr1) {}
-#endif
+OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
OF_DEVICE_FUNC bool operator()(nv_bfloat16 src) const { return isfinite(__bfloat162float(src)); }
};
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTrunc, nv_bfloat16, nv_bfloat16> {
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}
__device__ nv_bfloat16 operator()(nv_bfloat16 src) const {
#if __CUDA_ARCH__ >= 800
return htrunc(src);
#else
return __float2bfloat16(truncf(__bfloat162float(src)));
#endif // __CUDA_ARCH__ >= 800
}
};
#endif // CUDA_VERSION >= 11000
} // namespace primitive
} // namespace ep
......
......@@ -21,6 +21,7 @@ limitations under the License.
#include "oneflow/core/ep/include/event.h"
#include "oneflow/core/ep/include/stream.h"
#include "oneflow/core/ep/include/allocation_options.h"
#include "oneflow/core/ep/include/gpu_macro.h"
namespace oneflow {
......
......@@ -18,6 +18,7 @@ limitations under the License.
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/maybe.h"
#include "oneflow/core/ep/include/gpu_macro.h"
namespace oneflow {
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_EP_GPU_MACRO_H_
#define ONEFLOW_CORE_EP_GPU_MACRO_H_
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#define GPU(str) hip##str
#define GPURAND(str) hiprand##str
#define GPUMultiProcessorCount hipDeviceAttributeMultiprocessorCount
#define GPUMaxThreadsPerMultiProcessor hipDeviceAttributeMaxThreadsPerMultiProcessor
#define GPUMaxSharedMemoryPerBlockOptin hipDeviceAttributeSharedMemPerBlockOptin
__device__ __forceinline__ void TRAP()
{
asm volatile("s_trap 0;");
}
#else
#include <cuda.h>
#define GPU(str) cuda##str
#define GPURAND(str) curand##str
#define GPUMultiProcessorCount cudaDevAttrMultiProcessorCount
#define GPUMaxThreadsPerMultiProcessor cudaDevAttrMaxThreadsPerMultiProcessor
#define GPUMaxSharedMemoryPerBlockOptin cudaDevAttrMaxSharedMemoryPerBlockOptin
__device__ __forceinline__ void TRAP()
{
__trap();
}
#endif
#endif // ONEFLOW_CORE_EP_GPU_MACRO_H_
\ No newline at end of file
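These macros let shared source name the runtime API once: under ROCm, GPU(Stream_t) expands to hipStream_t and GPU(MemsetAsync) to hipMemsetAsync; under CUDA, to cudaStream_t and cudaMemsetAsync. A small call-site sketch (hypothetical helper, assuming the runtime headers are in scope; note TRAP() is device-side only):

#include "oneflow/core/ep/include/gpu_macro.h"

// Compiles as hipMemsetAsync(...) on ROCm and cudaMemsetAsync(...) on CUDA.
inline GPU(Error_t) ZeroAsync(GPU(Stream_t) stream, void* ptr, size_t bytes) {
  return GPU(MemsetAsync)(ptr, 0, bytes, stream);
}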
......@@ -32,6 +32,12 @@ enum class BinaryOp {
kMax,
kMin,
kPow,
kFmod,
kFloorDiv,
kTruncDiv,
kFloorMod,
kScalarBasePowerGrad,
kScalarExpPowerGrad,
// Comparison
kEqual,
kNotEqual,
......@@ -39,6 +45,8 @@ enum class BinaryOp {
kLessEqual,
kGreaterThan,
kGreaterEqual,
kIsClose,
kIsCloseEqualNan,
// Logical
kLogicalAnd,
kLogicalOr,
......@@ -62,7 +70,35 @@ enum class BinaryOp {
kTanhBackwardWithDyX,
kThresholdBackwardWithDyX,
kSigmoidBackwardWithDyY,
kAbsBackwardWithDyX,
kAcosBackwardWithDyX,
kAcoshBackwardWithDyX,
kAsinBackwardWithDyX,
kAsinhBackwardWithDyX,
kAtanBackwardWithDyX,
kAtanhBackwardWithDyX,
kCosBackwardWithDyX,
kCoshBackwardWithDyX,
kErfBackwardWithDyX,
kErfcBackwardWithDyX,
kExpBackwardWithDyX,
kExpm1BackwardWithDyX,
kLgammaBackwardWithDyX,
kLogBackwardWithDyX,
kLog2BackwardWithDyX,
kLog10BackwardWithDyX,
kLog1pBackwardWithDyX,
kLogSigmoidBackwardWithDyX,
kReciprocalBackwardWithDyX,
kReciprocalNoNanBackwardWithDyX,
kRsqrtBackwardWithDyX,
kSinBackwardWithDyX,
kSinhBackwardWithDyX,
kSqrtBackwardWithDyX,
kSquareBackwardWithDyX,
kTanBackwardWithDyX,
kFastGeluBackwardWithDyX,
kQuickGeluBackwardWithDyX,
};
}
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_EP_PRIMITIVE_BROADCAST_ELEMENTWISE_UNARY_H_
#define ONEFLOW_CORE_EP_PRIMITIVE_BROADCAST_ELEMENTWISE_UNARY_H_
#include "oneflow/core/ep/include/primitive/primitive.h"
#include "oneflow/core/ep/include/primitive/unary_op.h"
#include "oneflow/core/common/scalar.h"
namespace oneflow {
namespace ep {
namespace primitive {
class BroadcastElementwiseUnary : public Primitive {
public:
OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseUnary);
BroadcastElementwiseUnary() = default;
~BroadcastElementwiseUnary() override = default;
virtual void Launch(Stream* stream, size_t num_src_dims, const int64_t* src_dims,
const int64_t* src_strides, const void* src, size_t num_dst_dims,
const int64_t* dst_dims, const int64_t* dst_strides, void* dst) = 0;
virtual void Launch(Stream* stream, size_t num_src_dims, const int64_t* src_dims, const void* src,
size_t num_dst_dims, const int64_t* dst_dims, void* dst) = 0;
};
class BroadcastElementwiseUnaryFactory : public Factory<BroadcastElementwiseUnary> {
public:
OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseUnaryFactory);
BroadcastElementwiseUnaryFactory() = default;
~BroadcastElementwiseUnaryFactory() override = default;
virtual std::unique_ptr<BroadcastElementwiseUnary> New(UnaryOp op, DataType src_type,
DataType dst_type,
size_t max_num_dims) = 0;
virtual std::unique_ptr<BroadcastElementwiseUnary> New(UnaryOp op, DataType src_type,
DataType dst_type, size_t max_num_dims,
Scalar attr0) = 0;
virtual std::unique_ptr<BroadcastElementwiseUnary> New(UnaryOp op, DataType src_type,
DataType dst_type, size_t max_num_dims,
Scalar attr0, Scalar attr1) = 0;
};
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif // ONEFLOW_CORE_EP_PRIMITIVE_BROADCAST_ELEMENTWISE_UNARY_H_
......@@ -16,8 +16,10 @@ limitations under the License.
#ifndef ONEFLOW_CORE_EP_PRIMITIVE_FAST_INTEGER_MATH_H_
#define ONEFLOW_CORE_EP_PRIMITIVE_FAST_INTEGER_MATH_H_
#include "oneflow/core/common/data_type.h"
#ifdef WITH_ROCM
#include "hip/device_functions.h" // /opt/rocm/hip/include/hip
#endif
#include <cassert>
#include "device_functions.h" // /opt/rocm/hip/include/hip
namespace oneflow {
......
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_EP_PRIMITIVE_TENSOR_FILL_H_
#define ONEFLOW_CORE_EP_PRIMITIVE_TENSOR_FILL_H_
#include "oneflow/core/ep/include/primitive/primitive.h"
namespace oneflow {
namespace ep {
namespace primitive {
class TensorFill : public Primitive {
public:
OF_DISALLOW_COPY_AND_MOVE(TensorFill);
TensorFill() = default;
~TensorFill() override = default;
virtual void Launch(Stream* stream, const void* src, void* dst, size_t count) = 0;
};
class TensorFillFactory : public Factory<TensorFill> {
public:
OF_DISALLOW_COPY_AND_MOVE(TensorFillFactory);
TensorFillFactory() = default;
~TensorFillFactory() override = default;
virtual std::unique_ptr<TensorFill> New(DataType data_type) = 0;
};
} // namespace primitive
} // namespace ep
} // namespace oneflow
#endif // ONEFLOW_CORE_EP_PRIMITIVE_TENSOR_FILL_H_
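As with the other primitives, a caller resolves this interface through the registry. A hedged sketch, again assuming the NewPrimitive helper plus pre-existing device pointers; per the TensorFillGpu kernel earlier in this commit, src must point to a single device-resident element, since the kernel reads value[0] on the device:

// Fill `count` floats at `dst_ptr` with the value stored at `value_ptr`.
auto fill = oneflow::ep::primitive::NewPrimitive<oneflow::ep::primitive::TensorFillFactory>(
    oneflow::DeviceType::kCUDA, oneflow::DataType::kFloat);
if (fill) { fill->Launch(stream, /*src=*/value_ptr, /*dst=*/dst_ptr, count); }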
......@@ -22,6 +22,7 @@ namespace ep {
namespace primitive {
enum class UnaryOp {
kIdentity,
// activation op
kElu,
kCelu,
......@@ -40,13 +41,53 @@ enum class UnaryOp {
kSoftPlus,
kTanh,
kThreshold,
kFastGelu,
kQuickGelu,
// math op
kAbs,
kAcos,
kAcosh,
kAsin,
kAsinh,
kAtan,
kAtanh,
kCeil,
kCos,
kCosh,
kErf,
kErfc,
kExp,
kExpm1,
kFloor,
kLgamma,
kLog,
kLog2,
kLog10,
kLog1p,
kLogSigmoid,
kNegative,
kReciprocal,
kReciprocalNoNan,
kRint,
kRound,
kRsqrt,
kSigmoid,
kSign,
kSin,
kSinh,
kSqrt,
kSquare,
kTan,
kTrunc,
kNotEqualZero,
// logical op
kLogicalNot,
// utils op
kIsInf,
kIsNan,
kIsFinite,
kNanAssign,
};
}
......
......@@ -20,6 +20,7 @@ limitations under the License.
#include "oneflow/core/common/device_type.h"
#include "oneflow/core/common/maybe.h"
#include "oneflow/core/ep/include/event.h"
#include "oneflow/core/ep/include/gpu_macro.h"
namespace oneflow {
......@@ -37,6 +38,7 @@ class Stream {
virtual Device* device() const = 0;
virtual Maybe<void> Sync() = 0;
virtual void RecordEvent(Event* event) = 0;
virtual Maybe<void> GetAsyncError() { return Maybe<void>::Ok(); }
virtual Maybe<void> OnExecutionContextSetup() { return Maybe<void>::Ok(); }
virtual Maybe<void> OnExecutionContextTeardown() { return Maybe<void>::Ok(); }
......
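The new GetAsyncError() hook lets a device stream surface asynchronous failures without forcing a sync; the default implementation returns OK. A hypothetical override in a GPU stream subclass (sketch only; GPU(GetLastError) maps to hipGetLastError/cudaGetLastError, which also clears the sticky error state):

Maybe<void> GetAsyncError() override {
  // Poll the sticky error state instead of synchronizing the stream.
  GPU(Error_t) err = GPU(GetLastError)();
  if (err != GPU(Success)) { return Error::RuntimeError() << GPU(GetErrorString)(err); }
  return Maybe<void>::Ok();
}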
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/cuda_device.h"
#include "oneflow/core/ep/rocm/cuda_event.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
// #if CUDA_VERSION >= 11000
// #include <cuda_bf16.h>
// #endif
namespace oneflow {
namespace ep {
namespace {
constexpr size_t kDefaultConstBufElementCount = 1024 * 1024;
template<typename T>
void CreateConstBuffer(void** buf, T value, size_t n) {
OF_CUDA_CHECK(hipMalloc(buf, n * sizeof(T)));
std::vector<T> host(n, value);
OF_CUDA_CHECK(hipMemcpy(*buf, host.data(), n * sizeof(T), hipMemcpyDefault));
}
} // namespace
CudaDevice::CudaDevice(int device_index, DeviceManager* device_manager)
: device_index_(device_index),
event_flags_{},
properties_{},
device_manager_(device_manager),
const_buf_elem_cnt_(0),
const_zeros_buffer_(nullptr),
const_ones_buffer_fp32_(nullptr),
const_ones_buffer_fp16_(nullptr),
const_ones_buffer_bf16_(nullptr) {
CudaCurrentDeviceGuard guard(device_index_);
OF_CUDA_CHECK(hipGetDeviceProperties(&properties_, device_index_));
event_flags_ = hipEventDisableTiming;
if (ParseBooleanFromEnv("ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC", false)) {
event_flags_ |= hipEventBlockingSync;
}
const_buf_elem_cnt_ = ParseIntegerFromEnv("ONEFLOW_EP_CUDA_CONST_BUFFER_ELEMENT_COUNT",
kDefaultConstBufElementCount);
if (const_buf_elem_cnt_ > 0) {
CreateConstBuffer<float>(&const_zeros_buffer_, static_cast<float>(0), const_buf_elem_cnt_);
CreateConstBuffer<float>(&const_ones_buffer_fp32_, static_cast<float>(1.0),
const_buf_elem_cnt_);
CreateConstBuffer<half>(&const_ones_buffer_fp16_, static_cast<half>(1.0), const_buf_elem_cnt_);
// #if CUDA_VERSION >= 11000
// CreateConstBuffer<nv_bfloat16>(&const_ones_buffer_bf16_, static_cast<nv_bfloat16>(1.0),
// const_buf_elem_cnt_);
// #endif
}
}
CudaDevice::~CudaDevice() {
CudaCurrentDeviceGuard guard(device_index_);
for (auto* event : events_) { delete event; }
OF_CUDA_CHECK(hipFree(const_zeros_buffer_));
OF_CUDA_CHECK(hipFree(const_ones_buffer_fp32_));
OF_CUDA_CHECK(hipFree(const_ones_buffer_fp16_));
OF_CUDA_CHECK(hipFree(const_ones_buffer_bf16_));
}
void CudaDevice::SetAsActiveDevice() { OF_CUDA_CHECK(hipSetDevice(device_index_)); }
Stream* CudaDevice::CreateStream() {
CudaCurrentDeviceGuard guard(device_index_);
return new CudaStream(this);
}
void CudaDevice::DestroyStream(Stream* stream) {
CudaCurrentDeviceGuard guard(device_index_);
delete stream;
}
void CudaDevice::CreateEvents(Event** events, size_t count) {
size_t copied = 0;
{
std::lock_guard<std::mutex> lock(events_mutex_);
copied = std::min(count, events_.size());
size_t offset = events_.size() - copied;
std::copy(events_.begin() + offset, events_.end(), events);
events_.resize(offset);
}
if (copied != count) {
CudaCurrentDeviceGuard guard(device_index_);
for (size_t i = copied; i < count; ++i) { events[i] = new CudaEvent(event_flags_); }
}
}
void CudaDevice::DestroyEvents(Event** events, size_t count) {
std::lock_guard<std::mutex> lock(events_mutex_);
events_.insert(events_.end(), events, events + count);
}
Maybe<void> CudaDevice::Alloc(const AllocationOptions& options, void** ptr, size_t size) {
CudaCurrentDeviceGuard guard(device_index_);
CHECK(!options.HasPinnedDevice());
hipError_t err = hipMalloc(ptr, size);
if (err != hipSuccess) {
return Error::RuntimeError() << hipGetErrorString(err);
} else {
return Maybe<void>::Ok();
}
}
void CudaDevice::Free(const AllocationOptions& attr, void* ptr) {
CudaCurrentDeviceGuard guard(device_index_);
OF_CUDA_CHECK(hipFree(ptr));
}
Maybe<void> CudaDevice::AllocPinned(const AllocationOptions& options, void** ptr, size_t size) {
CudaCurrentDeviceGuard guard(device_index_);
hipError_t err = NumaAwareCudaMallocHost(device_index_, ptr, size);
if (err != hipSuccess) {
return Error::RuntimeError() << hipGetErrorString(err);
} else {
return Maybe<void>::Ok();
}
}
void CudaDevice::FreePinned(const AllocationOptions& options, void* ptr) {
CudaCurrentDeviceGuard guard(device_index_);
OF_CUDA_CHECK(hipHostFree(ptr));
}
const hipDeviceProp_t& CudaDevice::properties() const { return properties_; }
const void* CudaDevice::GetConstZeros(DataType data_type, size_t n) const {
if (GetSizeOfDataType(data_type) * n
<= GetSizeOfDataType(DataType::kFloat) * const_buf_elem_cnt_) {
return const_zeros_buffer_;
} else {
return nullptr;
}
}
const void* CudaDevice::GetConstOnes(DataType data_type, size_t n) const {
if (n <= const_buf_elem_cnt_) {
if (data_type == DataType::kFloat) {
return const_ones_buffer_fp32_;
} else if (data_type == DataType::kFloat16) {
return const_ones_buffer_fp16_;
} else if (data_type == DataType::kBFloat16) {
return const_ones_buffer_bf16_;
} else {
return nullptr;
}
} else {
return nullptr;
}
}
} // namespace ep
} // namespace oneflow
#endif // WITH_ROCM
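A minimal lifecycle sketch for this device class, assuming a CudaDeviceManager* named manager obtained from the registry (names outside this diff):

auto device = manager->GetDevice(/*device_index=*/0);
oneflow::ep::Stream* stream = device->CreateStream();
void* buf = nullptr;
oneflow::AllocationOptions options;
CHECK_JUST(device->Alloc(options, &buf, /*size=*/1024));
// ... enqueue work on `stream`, e.g. via the primitives above ...
device->Free(options, buf);
device->DestroyStream(stream);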
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
#define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
#include "oneflow/core/ep/include/device.h"
#include "oneflow/core/common/data_type.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
namespace oneflow {
namespace ep {
class CudaDevice : public Device {
public:
OF_DISALLOW_COPY_AND_MOVE(CudaDevice);
explicit CudaDevice(int device_index, DeviceManager* device_manager);
~CudaDevice() override;
void SetAsActiveDevice() override;
DeviceType device_type() const override { return DeviceType::kCUDA; }
size_t device_index() const override { return device_index_; }
DeviceManager* device_manager() const override { return device_manager_; }
Stream* CreateStream() override;
void DestroyStream(Stream* stream) override;
void CreateEvents(Event** events, size_t count) override;
void DestroyEvents(Event** events, size_t count) override;
Maybe<void> Alloc(const AllocationOptions& options, void** ptr, size_t size) override;
void Free(const AllocationOptions& options, void* ptr) override;
Maybe<void> AllocPinned(const AllocationOptions& options, void** ptr, size_t size) override;
void FreePinned(const AllocationOptions& options, void* ptr) override;
const hipDeviceProp_t& properties() const;
const void* GetConstZeros(DataType data_type, size_t n) const;
const void* GetConstOnes(DataType data_type, size_t n) const;
private:
int device_index_;
std::mutex events_mutex_;
std::vector<Event*> events_;
unsigned int event_flags_;
hipDeviceProp_t properties_;
DeviceManager* device_manager_;
int64_t const_buf_elem_cnt_;
void* const_zeros_buffer_;
void* const_ones_buffer_fp32_;
void* const_ones_buffer_fp16_;
void* const_ones_buffer_bf16_;
};
} // namespace ep
} // namespace oneflow
#endif // WITH_ROCM
#endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/cuda_device_manager.h"
#include "oneflow/core/device/cuda_util.h"
#ifdef WITH_ROCM
namespace oneflow {
namespace ep {
CudaDeviceManager::CudaDeviceManager(DeviceManagerRegistry* registry) : registry_(registry) {}
CudaDeviceManager::~CudaDeviceManager() = default;
DeviceManagerRegistry* CudaDeviceManager::registry() const { return registry_; }
std::shared_ptr<Device> CudaDeviceManager::GetDevice(size_t device_index) {
std::lock_guard<std::mutex> lock(devices_mutex_);
if (device_index < devices_.size() && devices_.at(device_index)) {
return devices_.at(device_index);
}
auto device = std::make_shared<CudaDevice>(device_index, this);
if (device_index >= devices_.size()) { devices_.resize(device_index + 1); }
devices_.at(device_index) = device;
return device;
}
size_t CudaDeviceManager::GetDeviceCount(size_t primary_device_index) {
CudaCurrentDeviceGuard guard(primary_device_index);
return this->GetDeviceCount();
}
size_t CudaDeviceManager::GetDeviceCount() {
int count = 0;
hipError_t err = hipGetDeviceCount(&count);
if (err == hipErrorNoDevice || err == hipErrorInsufficientDriver) { return 0; }
OF_CUDA_CHECK(err);
return count;
}
size_t CudaDeviceManager::GetActiveDeviceIndex() {
int device = 0;
OF_CUDA_CHECK(hipGetDevice(&device));
return static_cast<size_t>(device);
}
void CudaDeviceManager::SetActiveDeviceByIndex(size_t device_index) {
OF_CUDA_CHECK(hipSetDevice(static_cast<int>(device_index)));
}
} // namespace ep
} // namespace oneflow
#endif // WITH_ROCM