Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Oneflow
Commits
a715222c
Commit
a715222c
authored
Feb 28, 2023
by
yuguo
Browse files
0.9.1-rocm
parent
f262efc9
Changes
469
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
837 additions
and
364 deletions
+837
-364
oneflow/core/ep/cuda/primitive/memcpy.cpp
oneflow/core/ep/cuda/primitive/memcpy.cpp
+48
-0
oneflow/core/ep/cuda/primitive/memset.cpp
oneflow/core/ep/cuda/primitive/memset.cpp
+46
-0
oneflow/core/ep/cuda/primitive/permute.cu
oneflow/core/ep/cuda/primitive/permute.cu
+8
-2
oneflow/core/ep/cuda/primitive/softmax.cu
oneflow/core/ep/cuda/primitive/softmax.cu
+2
-2
oneflow/core/ep/cuda/primitive/softmax_backward.cu
oneflow/core/ep/cuda/primitive/softmax_backward.cu
+2
-2
oneflow/core/ep/cuda/primitive/tensor_fill.cu
oneflow/core/ep/cuda/primitive/tensor_fill.cu
+135
-0
oneflow/core/ep/cuda/primitive/type_seq.h
oneflow/core/ep/cuda/primitive/type_seq.h
+68
-0
oneflow/core/ep/cuda/primitive/unary_functor.cuh
oneflow/core/ep/cuda/primitive/unary_functor.cuh
+278
-30
oneflow/core/ep/include/device.h
oneflow/core/ep/include/device.h
+1
-0
oneflow/core/ep/include/event.h
oneflow/core/ep/include/event.h
+1
-0
oneflow/core/ep/include/gpu_macro.h
oneflow/core/ep/include/gpu_macro.h
+49
-0
oneflow/core/ep/include/primitive/binary_op.h
oneflow/core/ep/include/primitive/binary_op.h
+37
-1
oneflow/core/ep/include/primitive/broadcast_elementwise_unary.h
...w/core/ep/include/primitive/broadcast_elementwise_unary.h
+66
-0
oneflow/core/ep/include/primitive/fast_integer_math.h
oneflow/core/ep/include/primitive/fast_integer_math.h
+3
-1
oneflow/core/ep/include/primitive/tensor_fill.h
oneflow/core/ep/include/primitive/tensor_fill.h
+49
-0
oneflow/core/ep/include/primitive/unary_op.h
oneflow/core/ep/include/primitive/unary_op.h
+42
-1
oneflow/core/ep/include/stream.h
oneflow/core/ep/include/stream.h
+2
-0
oneflow/core/ep/rocm/cuda_device.cpp
oneflow/core/ep/rocm/cuda_device.cpp
+0
-179
oneflow/core/ep/rocm/cuda_device.h
oneflow/core/ep/rocm/cuda_device.h
+0
-78
oneflow/core/ep/rocm/cuda_device_manager.cpp
oneflow/core/ep/rocm/cuda_device_manager.cpp
+0
-68
No files found.
Too many changes to show.
To preserve performance only
469 of 469+
files are displayed.
Plain diff
Email patch
oneflow/core/ep/cuda/primitive/memcpy.cpp
View file @
a715222c
...
...
@@ -60,3 +60,51 @@ REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemcpyFactory, MemcpyFactoryImpl);
}
// namespace oneflow
#endif
#ifdef WITH_ROCM
#include "oneflow/core/ep/include/primitive/memcpy.h"
#include "oneflow/core/ep/cuda/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace
oneflow
{
namespace
ep
{
namespace
primitive
{
namespace
{
class
MemcpyImpl
:
public
Memcpy
{
public:
OF_DISALLOW_COPY_AND_MOVE
(
MemcpyImpl
);
MemcpyImpl
()
=
default
;
~
MemcpyImpl
()
override
=
default
;
void
Launch
(
Stream
*
stream
,
void
*
dst
,
const
void
*
src
,
size_t
count
)
override
{
if
(
dst
==
src
)
{
return
;
}
auto
*
cuda_stream
=
stream
->
As
<
CudaStream
>
();
OF_CUDA_CHECK
(
hipMemcpyAsync
(
dst
,
src
,
count
,
hipMemcpyDefault
,
cuda_stream
->
cuda_stream
()));
}
};
class
MemcpyFactoryImpl
:
public
MemcpyFactory
{
public:
OF_DISALLOW_COPY_AND_MOVE
(
MemcpyFactoryImpl
);
MemcpyFactoryImpl
()
=
default
;
~
MemcpyFactoryImpl
()
override
=
default
;
std
::
unique_ptr
<
Memcpy
>
New
(
MemcpyKind
kind
)
override
{
return
std
::
unique_ptr
<
Memcpy
>
(
new
MemcpyImpl
());
}
};
REGISTER_PRIMITIVE_FACTORY
(
DeviceType
::
kCUDA
,
MemcpyFactory
,
MemcpyFactoryImpl
);
}
// namespace
}
// namespace primitive
}
// namespace ep
}
// namespace oneflow
#endif
oneflow/core/ep/cuda/primitive/memset.cpp
View file @
a715222c
...
...
@@ -57,3 +57,49 @@ REGISTER_PRIMITIVE_FACTORY(DeviceType::kCUDA, MemsetFactory, MemsetFactoryImpl);
}
// namespace oneflow
#endif
#ifdef WITH_ROCM
#include "oneflow/core/ep/include/primitive/memset.h"
#include "oneflow/core/ep/cuda/cuda_stream.h"
#include <hip/hip_runtime.h>
namespace
oneflow
{
namespace
ep
{
namespace
primitive
{
namespace
{
class
MemsetImpl
:
public
Memset
{
public:
OF_DISALLOW_COPY_AND_MOVE
(
MemsetImpl
);
MemsetImpl
()
=
default
;
~
MemsetImpl
()
override
=
default
;
void
Launch
(
Stream
*
stream
,
void
*
ptr
,
int
value
,
size_t
count
)
override
{
auto
*
cuda_stream
=
stream
->
As
<
CudaStream
>
();
OF_CUDA_CHECK
(
hipMemsetAsync
(
ptr
,
value
,
count
,
cuda_stream
->
cuda_stream
()));
}
};
class
MemsetFactoryImpl
:
public
MemsetFactory
{
public:
OF_DISALLOW_COPY_AND_MOVE
(
MemsetFactoryImpl
);
MemsetFactoryImpl
()
=
default
;
~
MemsetFactoryImpl
()
override
=
default
;
std
::
unique_ptr
<
Memset
>
New
()
override
{
return
std
::
unique_ptr
<
Memset
>
(
new
MemsetImpl
());
}
};
REGISTER_PRIMITIVE_FACTORY
(
DeviceType
::
kCUDA
,
MemsetFactory
,
MemsetFactoryImpl
);
}
// namespace
}
// namespace primitive
}
// namespace ep
}
// namespace oneflow
#endif
oneflow/core/ep/cuda/primitive/permute.cu
View file @
a715222c
...
...
@@ -16,7 +16,11 @@ limitations under the License.
#include "oneflow/core/ep/include/primitive/permute.h"
#include "oneflow/core/ep/common/primitive/permute_impl.h"
#include "oneflow/core/ep/cuda/cuda_stream.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif
namespace
oneflow
{
...
...
@@ -192,7 +196,7 @@ __global__ void BatchTransposeMovement2Kernel(const void* src_ptr, void* dst_ptr
}
template
<
size_t
num_dims
,
size_t
movement_size
,
size_t
tile_size
,
typename
IndexType
>
void
LaunchBatchTransposeKernel
(
cuda
Stream_t
&
cuda_stream
,
void
LaunchBatchTransposeKernel
(
GPU
(
Stream_t
)
&
cuda_stream
,
const
PermuteKernelParams
<
num_dims
,
IndexType
>&
params
,
const
IndexType
&
num_batches
,
const
IndexType
&
rows
,
const
IndexType
&
cols
)
{
...
...
@@ -264,7 +268,7 @@ void LaunchKernel(Stream* stream, const int64_t* src_dims, const void* src, cons
void
*
dst
,
size_t
count
)
{
PermuteKernelParams
<
num_dims
,
IndexType
>
params
=
MakePermuteParams
<
num_dims
,
IndexType
>
(
src_dims
,
src
,
permutation
,
dst
,
count
);
cuda
Stream_t
cuda_stream
=
stream
->
As
<
CudaStream
>
()
->
cuda_stream
();
GPU
(
Stream_t
)
cuda_stream
=
stream
->
As
<
CudaStream
>
()
->
cuda_stream
();
if
(
num_dims
==
2
||
num_dims
==
3
)
{
IndexType
num_batches
;
...
...
@@ -281,10 +285,12 @@ void LaunchKernel(Stream* stream, const int64_t* src_dims, const void* src, cons
cuda_stream
,
params
,
num_batches
,
rows
,
cols
);
}
}
else
{
if
(
params
.
count
==
0
)
{
return
;
}
PermuteKernel
<
num_dims
,
movement_size
,
IndexType
>
<<<
BlocksNum4ThreadsNum
(
params
.
count
),
kCudaThreadsNumPerBlock
,
0
,
cuda_stream
>>>
(
params
);
}
}
else
{
if
(
params
.
count
==
0
)
{
return
;
}
PermuteKernel
<
num_dims
,
movement_size
,
IndexType
>
<<<
BlocksNum4ThreadsNum
(
params
.
count
),
kCudaThreadsNumPerBlock
,
0
,
cuda_stream
>>>
(
params
);
}
...
...
oneflow/core/ep/cuda/primitive/softmax.cu
View file @
a715222c
...
...
@@ -32,7 +32,7 @@ enum class Algorithm {
};
template
<
Algorithm
algorithm
,
typename
T
>
void
SoftmaxGpu
(
cuda
Stream_t
cuda_stream
,
size_t
rows
,
size_t
cols
,
const
T
*
x
,
T
*
y
)
{
void
SoftmaxGpu
(
GPU
(
Stream_t
)
cuda_stream
,
size_t
rows
,
size_t
cols
,
const
T
*
x
,
T
*
y
)
{
using
ComputeType
=
typename
cuda
::
softmax
::
DefaultComputeType
<
T
>::
type
;
oneflow
::
cuda
::
softmax
::
DirectLoad
<
T
,
ComputeType
>
load
(
x
,
cols
);
oneflow
::
cuda
::
softmax
::
DirectStore
<
ComputeType
,
T
>
store
(
y
,
cols
);
...
...
@@ -55,7 +55,7 @@ class SoftmaxImpl : public SoftmaxBase {
~
SoftmaxImpl
()
override
=
default
;
void
Launch
(
Stream
*
stream
,
size_t
rows
,
size_t
cols
,
const
void
*
x
,
void
*
y
)
override
{
cuda
Stream_t
cuda_stream
=
stream
->
As
<
CudaStream
>
()
->
cuda_stream
();
GPU
(
Stream_t
)
cuda_stream
=
stream
->
As
<
CudaStream
>
()
->
cuda_stream
();
SoftmaxGpu
<
algorithm
,
T
>
(
cuda_stream
,
rows
,
cols
,
reinterpret_cast
<
const
T
*>
(
x
),
reinterpret_cast
<
T
*>
(
y
));
}
...
...
oneflow/core/ep/cuda/primitive/softmax_backward.cu
View file @
a715222c
...
...
@@ -32,7 +32,7 @@ enum class Algorithm {
};
template
<
Algorithm
algorithm
,
typename
T
>
void
SoftmaxBackwardGpu
(
cuda
Stream_t
cuda_stream
,
size_t
rows
,
size_t
cols
,
const
T
*
y
,
const
T
*
dy
,
void
SoftmaxBackwardGpu
(
GPU
(
Stream_t
)
cuda_stream
,
size_t
rows
,
size_t
cols
,
const
T
*
y
,
const
T
*
dy
,
T
*
dx
)
{
using
ComputeType
=
typename
cuda
::
softmax
::
DefaultComputeType
<
T
>::
type
;
cuda
::
softmax
::
DirectLoad
<
T
,
ComputeType
>
load_y
(
y
,
cols
);
...
...
@@ -60,7 +60,7 @@ class SoftmaxBackwardImpl : public SoftmaxBackwardBase {
void
Launch
(
Stream
*
stream
,
size_t
rows
,
size_t
cols
,
const
void
*
y
,
const
void
*
dy
,
void
*
dx
)
override
{
cuda
Stream_t
cuda_stream
=
stream
->
As
<
CudaStream
>
()
->
cuda_stream
();
GPU
(
Stream_t
)
cuda_stream
=
stream
->
As
<
CudaStream
>
()
->
cuda_stream
();
SoftmaxBackwardGpu
<
algorithm
,
T
>
(
cuda_stream
,
rows
,
cols
,
reinterpret_cast
<
const
T
*>
(
y
),
reinterpret_cast
<
const
T
*>
(
dy
),
reinterpret_cast
<
T
*>
(
dx
));
}
...
...
oneflow/core/ep/cuda/primitive/tensor_fill.cu
0 → 100644
View file @
a715222c
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/include/primitive/tensor_fill.h"
#include "oneflow/core/ep/cuda/primitive/type_seq.h"
#include "oneflow/core/ep/cuda/cuda_stream.h"
namespace
oneflow
{
namespace
ep
{
namespace
primitive
{
namespace
{
template
<
size_t
size
>
using
Storage
=
typename
std
::
aligned_storage
<
size
,
size
>::
type
;
template
<
typename
T
,
size_t
pack
>
union
Pack
{
static
constexpr
size_t
size
=
sizeof
(
T
)
*
pack
;
explicit
__device__
__host__
Pack
(
const
T
value
)
{
static_assert
(
sizeof
(
Pack
)
==
size
,
""
);
static_assert
(
alignof
(
Pack
)
==
size
,
""
);
#pragma unroll
for
(
size_t
i
=
0
;
i
<
pack
;
++
i
)
{
elem
[
i
]
=
value
;
}
}
T
elem
[
pack
];
Storage
<
size
>
storage
;
};
template
<
typename
T
,
size_t
pack
>
__global__
void
TensorFillGpu
(
T
*
dst
,
const
T
*
value
,
size_t
count
)
{
const
size_t
pack_count
=
count
/
pack
;
const
T
fill_value
=
value
[
0
];
Pack
<
T
,
pack
>
pack_value
(
fill_value
);
auto
*
pack_dst
=
reinterpret_cast
<
decltype
(
pack_value
.
storage
)
*>
(
dst
);
CUDA_1D_KERNEL_LOOP_T
(
size_t
,
i
,
pack_count
)
{
pack_dst
[
i
]
=
pack_value
.
storage
;
}
T
*
tail_dst
=
dst
+
pack_count
*
pack
;
const
size_t
tail_count
=
count
-
pack_count
*
pack
;
CUDA_1D_KERNEL_LOOP_T
(
size_t
,
i
,
tail_count
)
{
tail_dst
[
i
]
=
fill_value
;
}
}
template
<
typename
T
,
size_t
pack
>
typename
std
::
enable_if
<
(
pack
!=
0
),
void
>::
type
LaunchPackTensorFill
(
GPU
(
Stream_t
)
stream
,
T
*
dst
,
const
T
*
value
,
size_t
count
)
{
TensorFillGpu
<
T
,
pack
>
<<<
BlocksNum4ThreadsNum
(
count
),
kCudaThreadsNumPerBlock
,
0
,
stream
>>>
(
dst
,
value
,
count
);
}
template
<
typename
T
,
size_t
pack
>
typename
std
::
enable_if
<
(
pack
==
0
),
void
>::
type
LaunchPackTensorFill
(
GPU
(
Stream_t
)
stream
,
T
*
dst
,
const
T
*
value
,
size_t
count
)
{
LOG
(
FATAL
)
<<
"wrong alignment"
;
}
template
<
typename
T
>
void
LaunchTensorFill
(
GPU
(
Stream_t
)
stream
,
T
*
dst
,
const
T
*
value
,
size_t
count
)
{
auto
uintptr
=
reinterpret_cast
<
std
::
uintptr_t
>
(
dst
);
if
(
uintptr
%
16
==
0
)
{
LaunchPackTensorFill
<
T
,
16
/
sizeof
(
T
)
>
(
stream
,
dst
,
value
,
count
);
}
else
if
(
uintptr
%
8
==
0
)
{
LaunchPackTensorFill
<
T
,
8
/
sizeof
(
T
)
>
(
stream
,
dst
,
value
,
count
);
}
else
if
(
uintptr
%
4
==
0
)
{
LaunchPackTensorFill
<
T
,
4
/
sizeof
(
T
)
>
(
stream
,
dst
,
value
,
count
);
}
else
if
(
uintptr
%
2
==
0
)
{
LaunchPackTensorFill
<
T
,
2
/
sizeof
(
T
)
>
(
stream
,
dst
,
value
,
count
);
}
else
{
LaunchPackTensorFill
<
T
,
1
/
sizeof
(
T
)
>
(
stream
,
dst
,
value
,
count
);
}
}
template
<
typename
T
>
class
TensorFillImpl
:
public
TensorFill
{
public:
OF_DISALLOW_COPY_AND_MOVE
(
TensorFillImpl
);
TensorFillImpl
()
=
default
;
~
TensorFillImpl
()
override
=
default
;
void
Launch
(
Stream
*
stream
,
const
void
*
src
,
void
*
dst
,
size_t
count
)
override
{
GPU
(
Stream_t
)
cuda_stream
=
stream
->
As
<
CudaStream
>
()
->
cuda_stream
();
const
T
*
value
=
reinterpret_cast
<
const
T
*>
(
src
);
LaunchTensorFill
<
T
>
(
cuda_stream
,
reinterpret_cast
<
T
*>
(
dst
),
value
,
count
);
}
};
template
<
typename
T
>
std
::
unique_ptr
<
TensorFill
>
NewTensorFill
()
{
return
std
::
unique_ptr
<
TensorFill
>
(
new
TensorFillImpl
<
T
>
());
}
class
TensorFillFactoryImpl
:
public
TensorFillFactory
{
public:
OF_DISALLOW_COPY_AND_MOVE
(
TensorFillFactoryImpl
);
TensorFillFactoryImpl
()
=
default
;
~
TensorFillFactoryImpl
()
override
=
default
;
std
::
unique_ptr
<
TensorFill
>
New
(
DataType
data_type
)
override
{
#define MAKE_NEW_TENSOR_FILL_ENTRY(type_cpp, type_proto) {type_proto, NewTensorFill<type_cpp>},
static
const
std
::
map
<
DataType
,
std
::
function
<
std
::
unique_ptr
<
TensorFill
>
()
>>
new_fill_handle
{
OF_PP_FOR_EACH_TUPLE
(
MAKE_NEW_TENSOR_FILL_ENTRY
,
CUDA_PRIMITIVE_ALL_TYPE_SEQ
)};
#undef MAKE_NEW_TENSOR_FILL_ENTRY
const
auto
it
=
new_fill_handle
.
find
(
data_type
);
if
(
it
!=
new_fill_handle
.
end
())
{
return
it
->
second
();
}
else
{
return
nullptr
;
}
}
};
REGISTER_PRIMITIVE_FACTORY
(
DeviceType
::
kCUDA
,
TensorFillFactory
,
TensorFillFactoryImpl
);
}
// namespace
}
// namespace primitive
}
// namespace ep
}
// namespace oneflow
oneflow/core/ep/cuda/primitive/type_seq.h
View file @
a715222c
...
...
@@ -63,6 +63,12 @@ limitations under the License.
CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \
CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#define CUDA_PRIMITIVE_INT_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT32_TYPE_SEQ \
CUDA_PRIMITIVE_INT64_TYPE_SEQ
#define UTIL_OPS_DATA_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
...
...
@@ -75,4 +81,66 @@ limitations under the License.
#endif // WITH_CUDA
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
// #if CUDA_VERSION >= 11000
// #include <cuda_bf16.h>
// #endif // CUDA_VERSION >= 11000
#define CUDA_PRIMITIVE_BOOL_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(bool, DataType::kBool)
#define CUDA_PRIMITIVE_CHAR_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(char, DataType::kChar)
#define CUDA_PRIMITIVE_INT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int8_t, DataType::kInt8)
#define CUDA_PRIMITIVE_UINT8_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint8_t, DataType::kUInt8)
#define CUDA_PRIMITIVE_INT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32)
#define CUDA_PRIMITIVE_UINT32_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint32_t, DataType::kUInt32)
#define CUDA_PRIMITIVE_INT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64)
#define CUDA_PRIMITIVE_UINT64_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(uint64_t, DataType::kUInt64)
#define CUDA_PRIMITIVE_FLOAT_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat)
#define CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(double, DataType::kDouble)
#define CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(half, DataType::kFloat16)
// #if CUDA_VERSION >= 11000
// #define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ OF_PP_MAKE_TUPLE_SEQ(nv_bfloat16, DataType::kBFloat16)
// #else
#define CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
// #endif // CUDA_VERSION >= 11000
#define CUDA_PRIMITIVE_ALL_TYPE_SEQ \
CUDA_PRIMITIVE_BOOL_TYPE_SEQ \
CUDA_PRIMITIVE_CHAR_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT32_TYPE_SEQ \
CUDA_PRIMITIVE_INT64_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \
CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \
CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#define CUDA_PRIMITIVE_FLOATING_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \
CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \
CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#define CUDA_PRIMITIVE_INT_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT32_TYPE_SEQ \
CUDA_PRIMITIVE_INT64_TYPE_SEQ
#define UTIL_OPS_DATA_TYPE_SEQ \
CUDA_PRIMITIVE_INT8_TYPE_SEQ \
CUDA_PRIMITIVE_UINT8_TYPE_SEQ \
CUDA_PRIMITIVE_INT32_TYPE_SEQ \
CUDA_PRIMITIVE_INT64_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT_TYPE_SEQ \
CUDA_PRIMITIVE_DOUBLE_TYPE_SEQ \
CUDA_PRIMITIVE_FLOAT16_TYPE_SEQ \
CUDA_PRIMITIVE_BFLOAT16_TYPE_SEQ
#endif // WITH_ROCM
#endif // ONEFLOW_CORE_EP_CUDA_PRIMITIVE_TYPE_SEQ_H_
oneflow/core/ep/cuda/primitive/unary_functor.cuh
View file @
a715222c
...
...
@@ -17,14 +17,19 @@ limitations under the License.
#include "oneflow/core/ep/cuda/primitive/type_seq.h"
#include "oneflow/core/cuda/elementwise.cuh"
#include "oneflow/core/ep/cuda/cuda_stream.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#else
#include <cuda.h>
#endif
namespace
oneflow
{
namespace
ep
{
namespace
primitive
{
template
<
typename
Dst
,
typename
Src
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kGelu
,
Dst
,
Src
>
{
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
Dst
operator
()(
Src
src
)
const
{
return
static_cast
<
Src
>
(
0.5
)
*
src
...
...
@@ -32,78 +37,236 @@ struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kGelu, Dst, Src> {
}
};
template
<
typename
Dst
,
typename
Src
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kFastGelu
,
Dst
,
Src
>
{
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
Dst
operator
()(
Src
src
)
const
{
// ref to: https://mlfromscratch.com/activation-functions-explained/#gelu
const
Src
half
=
static_cast
<
Src
>
(
0.5
);
const
Src
one
=
static_cast
<
Src
>
(
1
);
const
Src
tanh_in
=
alpha
*
(
src
+
beta
*
src
*
src
*
src
);
return
half
*
src
*
(
one
+
tanh
(
tanh_in
));
}
private:
// constant ref to:
// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/test/testdata/transform/fusion/fast_gelu.py
static
constexpr
Src
alpha
=
static_cast
<
Src
>
(
0.7978845608028654
);
static
constexpr
Src
beta
=
static_cast
<
Src
>
(
0.044714998453855515
);
};
template
<
typename
Dst
,
typename
Src
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kQuickGelu
,
Dst
,
Src
>
{
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
Dst
operator
()(
Src
src
)
const
{
const
Src
sigmoid
=
static_cast
<
Dst
>
(
static_cast
<
Src
>
(
1.0
)
/
(
static_cast
<
Src
>
(
1.0
)
+
exp
(
-
src
*
alpha
)));
return
src
*
sigmoid
;
}
private:
static
constexpr
Src
alpha
=
static_cast
<
Src
>
(
1.702
);
};
namespace
unary_functor_internal
{
namespace
{
OF_DEVICE_FUNC
float
TanhApprox
(
float
x
)
{
#if (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
float
r
;
asm
(
"tanh.approx.f32 %0,%1;
\n\t
"
:
"=f"
(
r
)
:
"f"
(
x
));
return
r
;
#else
return
tanhf
(
x
);
#endif // (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
}
}
// namespace
}
// namespace unary_functor_internal
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kFastGelu
,
half
,
half
>
{
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
:
float_functor
(
attr0
,
attr1
)
{}
OF_DEVICE_FUNC
half
operator
()(
half
src
)
const
{
#if (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
const
float
tanh_in
=
__half2float
(
__float2half_rn
(
alpha
)
*
(
src
+
__float2half_rn
(
beta
)
*
src
*
src
*
src
));
const
float
tanh_out
=
unary_functor_internal
::
TanhApprox
(
tanh_in
);
return
__float2half_rn
(
0.5
F
)
*
src
*
(
__float2half_rn
(
1.0
F
)
+
__float2half_rn
(
tanh_out
));
#else
return
static_cast
<
half
>
(
float_functor
(
static_cast
<
float
>
(
src
)));
#endif // (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
}
#if (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
__device__
void
Apply2
(
half
*
dst
,
const
half
*
src
)
const
{
const
half2
src2
=
*
(
reinterpret_cast
<
const
half2
*>
(
src
));
const
float2
tanh_in
=
__half22float2
(
__hmul2
(
__float2half2_rn
(
alpha
),
__hadd2
(
src2
,
__hmul2
(
__hmul2
(
__hmul2
(
__float2half2_rn
(
beta
),
src2
),
src2
),
src2
))));
float2
tanh_out
;
tanh_out
.
x
=
unary_functor_internal
::
TanhApprox
(
tanh_in
.
x
);
tanh_out
.
y
=
unary_functor_internal
::
TanhApprox
(
tanh_in
.
y
);
const
half2
dst2
=
__hmul2
(
__hmul2
(
__float2half2_rn
(
0.5
F
),
src2
),
__hadd2
(
__float2half2_rn
(
1.0
F
),
__float22half2_rn
(
tanh_out
)));
*
reinterpret_cast
<
half2
*>
(
dst
)
=
dst2
;
}
#endif // (__CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000)
private:
static
constexpr
float
alpha
=
0.7978845608028654
F
;
static
constexpr
float
beta
=
0.044714998453855515
F
;
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kFastGelu
,
float
,
float
>
float_functor
;
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kTanh
,
float
,
float
>
{
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
float
operator
()(
float
src
)
const
{
return
tanhf
(
src
);
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kTanh
,
double
,
double
>
{
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
double
operator
()(
double
src
)
const
{
return
tanh
(
src
);
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kTanh
,
half
,
half
>
{
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
half
operator
()(
half
src
)
const
{
return
__float2half
(
tanhf
(
__half2float
(
src
)));
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kIsInf
,
bool
,
half
>
{
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
bool
operator
()(
half
src
)
const
{
return
isinf
(
__half2float
(
src
));
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kIsInf
,
bool
,
float
>
{
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
bool
operator
()(
float
src
)
const
{
return
isinf
(
src
);
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kIsInf
,
bool
,
double
>
{
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
bool
operator
()(
double
src
)
const
{
return
isinf
(
src
);
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kIsNan
,
bool
,
half
>
{
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
bool
operator
()(
half
src
)
const
{
return
isnan
(
__half2float
(
src
));
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kIsNan
,
bool
,
float
>
{
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
bool
operator
()(
float
src
)
const
{
return
isnan
(
src
);
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kIsNan
,
bool
,
double
>
{
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
bool
operator
()(
double
src
)
const
{
return
isnan
(
src
);
}
};
#define SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(op) \
template<> \
struct UnaryFunctor<DeviceType::kCUDA, op, half, half> { \
UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
\
UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
OF_DEVICE_FUNC half operator()(half src) const { \
return __float2half(float_functor(__half2float(src))); \
} \
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kIsFinite
,
bool
,
half
>
{
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
bool
operator
()(
half
src
)
const
{
return
isfinite
(
__half2float
(
src
));
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kIsFinite
,
bool
,
float
>
{
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
bool
operator
()(
float
src
)
const
{
return
isfinite
(
src
);
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kIsFinite
,
bool
,
double
>
{
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
bool
operator
()(
double
src
)
const
{
return
isfinite
(
src
);
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kTrunc
,
half
,
half
>
{
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
__device__
half
operator
()(
half
src
)
const
{
return
htrunc
(
src
);
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kTrunc
,
float
,
float
>
{
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
float
operator
()(
float
src
)
const
{
return
truncf
(
src
);
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kTrunc
,
double
,
double
>
{
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
double
operator
()(
double
src
)
const
{
return
trunc
(
src
);
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kAbs
,
half
,
half
>
{
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
__device__
half
operator
()(
half
src
)
const
{
return
__hlt
(
src
,
static_cast
<
half
>
(
0
))
?
__hneg
(
src
)
:
src
;
}
};
template
<
typename
Dst
,
typename
Src
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kNanAssign
,
Dst
,
Src
>
{
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
OF_DEVICE_FUNC
Dst
operator
()(
Src
src
)
const
{
return
isnan
(
src
)
?
static_cast
<
Dst
>
(
0.0
)
:
src
;
}
};
#if CUDA_VERSION >= 11000
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kAbs
,
nv_bfloat16
,
nv_bfloat16
>
{
OF_DEVICE_FUNC
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
__device__
nv_bfloat16
operator
()(
nv_bfloat16
src
)
const
{
#if CUDA_ARCH >= 800
return
__habs
(
src
);
#else
return
__float2bfloat16
(
abs
(
__bfloat162float
(
src
)));
#endif // CUDA_ARCH >= 800
}
};
#endif // CUDA_VERSION >= 11000
#define SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(op) \
template<> \
struct UnaryFunctor<DeviceType::kCUDA, op, half, half> { \
OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
\
UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
OF_DEVICE_FUNC half operator()(half src) const { \
return __float2half(float_functor(__half2float(src))); \
} \
};
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kElu
);
...
...
@@ -114,20 +277,53 @@ SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR(UnaryOp::kSelu);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kSilu
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kSoftSign
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kSoftPlus
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kAcos
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kAcosh
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kAsin
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kAsinh
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kAtan
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kAtanh
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kCeil
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kCos
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kCosh
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kErf
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kErfc
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kExp
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kExpm1
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kFloor
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kLgamma
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kLog
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kLog2
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kLog10
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kLog1p
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kLogSigmoid
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kRint
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kRound
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kRsqrt
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kSigmoid
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kSin
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kSinh
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kSqrt
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kSquare
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kTan
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kReciprocalNoNan
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kNotEqualZero
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kNanAssign
);
SPECIALIZATION_PSEUDO_HALF_UNARY_FUNCTOR
(
UnaryOp
::
kQuickGelu
);
/*********nv_bfloat16_kernel*******/
#if CUDA_VERSION >= 11000
#define SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(op) \
template<> \
struct UnaryFunctor<DeviceType::kCUDA, op, nv_bfloat16, nv_bfloat16> { \
UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
\
UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor; \
OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src) const { \
return __float2bfloat16(float_functor(__bfloat162float(src))); \
} \
#define SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(op)
\
template<>
\
struct UnaryFunctor<DeviceType::kCUDA, op, nv_bfloat16, nv_bfloat16> {
\
OF_DEVICE_FUNC
UnaryFunctor(Scalar attr0, Scalar attr1) : float_functor(attr0, attr1) {} \
\
UnaryFunctor<DeviceType::kCUDA, op, float, float> float_functor;
\
OF_DEVICE_FUNC nv_bfloat16 operator()(nv_bfloat16 src) const {
\
return __float2bfloat16(float_functor(__bfloat162float(src)));
\
}
\
};
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kElu
);
...
...
@@ -146,6 +342,40 @@ SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR(UnaryOp::kSoftSign);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kSoftPlus
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kTanh
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kThreshold
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kAcos
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kAcosh
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kAsin
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kAsinh
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kAtan
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kAtanh
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kCeil
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kCos
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kCosh
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kErf
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kErfc
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kExp
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kExpm1
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kFloor
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kLgamma
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kLog
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kLog2
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kLog10
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kLog1p
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kLogSigmoid
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kRint
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kRound
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kRsqrt
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kSigmoid
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kSin
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kSinh
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kSqrt
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kSquare
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kTan
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kReciprocalNoNan
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kNotEqualZero
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kNanAssign
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kFastGelu
);
SPECIALIZATION_PSEUDO_BFLOAT16_UNARY_FUNCTOR
(
UnaryOp
::
kQuickGelu
);
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kIsInf
,
bool
,
nv_bfloat16
>
{
...
...
@@ -160,8 +390,26 @@ struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kIsNan, bool, nv_bfloat16> {
OF_DEVICE_FUNC
bool
operator
()(
nv_bfloat16
src
)
const
{
return
isnan
(
__bfloat162float
(
src
));
}
};
template
<
>
struct
UnaryFunctor
<
DeviceType
::
kCUDA
,
UnaryOp
::
kIsFinite
,
bool
,
nv_bfloat16
>
{
UnaryFunctor
(
Scalar
attr0
,
Scalar
attr1
)
{}
#endif
OF_DEVICE_FUNC
bool
operator
()(
nv_bfloat16
src
)
const
{
return
isfinite
(
__bfloat162float
(
src
));
}
};
// Truncation (round toward zero) for bfloat16 on CUDA.
template<>
struct UnaryFunctor<DeviceType::kCUDA, UnaryOp::kTrunc, nv_bfloat16, nv_bfloat16> {
  OF_DEVICE_FUNC UnaryFunctor(Scalar attr0, Scalar attr1) {}

  __device__ nv_bfloat16 operator()(nv_bfloat16 src) const {
    // BUG FIX: the guard previously tested `CUDA_ARCH`, which nvcc never defines
    // (the predefined macro is `__CUDA_ARCH__`), so it always evaluated to 0 and
    // the float round-trip fallback was taken even on Ampere+. `htrunc` has a
    // native nv_bfloat16 implementation from sm_80 onward.
#if __CUDA_ARCH__ >= 800
    return htrunc(src);
#else
    return __float2bfloat16(truncf(__bfloat162float(src)));
#endif  // __CUDA_ARCH__ >= 800
  }
};
#endif // CUDA_VERSION >= 11000
}
// namespace primitive
}
// namespace ep
...
...
oneflow/core/ep/include/device.h
View file @
a715222c
...
...
@@ -21,6 +21,7 @@ limitations under the License.
#include "oneflow/core/ep/include/event.h"
#include "oneflow/core/ep/include/stream.h"
#include "oneflow/core/ep/include/allocation_options.h"
#include "oneflow/core/ep/include/gpu_macro.h"
namespace
oneflow
{
...
...
oneflow/core/ep/include/event.h
View file @
a715222c
...
...
@@ -18,6 +18,7 @@ limitations under the License.
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/maybe.h"
#include "oneflow/core/ep/include/gpu_macro.h"
namespace
oneflow
{
...
...
oneflow/core/ep/include/gpu_macro.h
0 → 100644
View file @
a715222c
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_EP_GPU_MACRO_H_
#define ONEFLOW_CORE_EP_GPU_MACRO_H_

// Portability shims mapping a single "GPU"-prefixed spelling onto either the
// HIP (ROCm) or the CUDA runtime API, so call sites compile against both
// backends: GPU(Malloc) -> hipMalloc / cudaMalloc, GPURAND(Create) ->
// hiprandCreate / curandCreate, plus the device-attribute enumerators used by
// the executor.

#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#define GPU(str) hip##str
#define GPURAND(str) hiprand##str
#define GPUMultiProcessorCount hipDeviceAttributeMultiprocessorCount
#define GPUMaxThreadsPerMultiProcessor hipDeviceAttributeMaxThreadsPerMultiProcessor
#define GPUMaxSharedMemoryPerBlockOptin hipDeviceAttributeSharedMemPerBlockOptin
// Device-side abort for AMD GPUs.
// NOTE(review): trap id 0 is reserved on AMD GCN/RDNA ISAs (compiler-generated
// traps use `s_trap 2`) — confirm `s_trap 0;` actually raises a trap on the
// targeted hardware rather than acting as a no-op.
__device__ __forceinline__ void TRAP() { asm volatile("s_trap 0;"); }
#else
#include <cuda.h>
#define GPU(str) cuda##str
#define GPURAND(str) curand##str
#define GPUMultiProcessorCount cudaDevAttrMultiProcessorCount
#define GPUMaxThreadsPerMultiProcessor cudaDevAttrMaxThreadsPerMultiProcessor
#define GPUMaxSharedMemoryPerBlockOptin cudaDevAttrMaxSharedMemoryPerBlockOptin
// Device-side abort for NVIDIA GPUs (CUDA's built-in trap instruction).
__device__ __forceinline__ void TRAP() { __trap(); }
#endif

#endif  // ONEFLOW_CORE_EP_GPU_MACRO_H_
\ No newline at end of file
oneflow/core/ep/include/primitive/binary_op.h
View file @
a715222c
...
...
@@ -32,6 +32,12 @@ enum class BinaryOp {
kMax
,
kMin
,
kPow
,
kFmod
,
kFloorDiv
,
kTruncDiv
,
kFloorMod
,
kScalarBasePowerGrad
,
kScalarExpPowerGrad
,
// Comparision
kEqual
,
kNotEqual
,
...
...
@@ -39,6 +45,8 @@ enum class BinaryOp {
kLessEqual
,
kGreaterThan
,
kGreaterEqual
,
kIsClose
,
kIsCloseEqualNan
,
// Logical
kLogicalAnd
,
kLogicalOr
,
...
...
@@ -62,7 +70,35 @@ enum class BinaryOp {
kTanhBackwardWithDyX
,
kThresholdBackwardWithDyX
,
kSigmoidBackwardWithDyY
,
kAbsBackwardWithDyX
,
kAcosBackwardWithDyX
,
kAcoshBackwardWithDyX
,
kAsinBackwardWithDyX
,
kAsinhBackwardWithDyX
,
kAtanBackwardWithDyX
,
kAtanhBackwardWithDyX
,
kCosBackwardWithDyX
,
kCoshBackwardWithDyX
,
kErfBackwardWithDyX
,
kErfcBackwardWithDyX
,
kExpBackwardWithDyX
,
kExpm1BackwardWithDyX
,
kLgammaBackwardWithDyX
,
kLogBackwardWithDyX
,
kLog2BackwardWithDyX
,
kLog10BackwardWithDyX
,
kLog1pBackwardWithDyX
,
kLogSigmoidBackwardWithDyX
,
kReciprocalBackwardWithDyX
,
kReciprocalNoNanBackwardWithDyX
,
kRsqrtBackwardWithDyX
,
kSinBackwardWithDyX
,
kSinhBackwardWithDyX
,
kSqrtBackwardWithDyX
,
kSquareBackwardWithDyX
,
kTanBackwardWithDyX
,
kFastGeluBackwardWithDyX
,
kQuickGeluBackwardWithDyX
,
};
}
...
...
oneflow/core/ep/include/primitive/broadcast_elementwise_unary.h
0 → 100644
View file @
a715222c
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_EP_PRIMITIVE_BROADCAST_ELEMENTWISE_UNARY_H_
#define ONEFLOW_CORE_EP_PRIMITIVE_BROADCAST_ELEMENTWISE_UNARY_H_
#include "oneflow/core/ep/include/primitive/primitive.h"
#include "oneflow/core/ep/include/primitive/unary_op.h"
#include "oneflow/core/common/scalar.h"
namespace
oneflow
{
namespace
ep
{
namespace
primitive
{
// Primitive that applies a unary elementwise op, broadcasting the src extents
// onto the dst extents. Concrete op/type selection happens in the factory.
class BroadcastElementwiseUnary : public Primitive {
 public:
  OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseUnary);
  BroadcastElementwiseUnary() = default;
  ~BroadcastElementwiseUnary() override = default;

  // Strided variant: element strides are given explicitly for src and dst.
  virtual void Launch(Stream* stream, size_t num_src_dims, const int64_t* src_dims,
                      const int64_t* src_strides, const void* src, size_t num_dst_dims,
                      const int64_t* dst_dims, const int64_t* dst_strides, void* dst) = 0;
  // Dims-only variant: no strides supplied — presumably both tensors are
  // treated as contiguous; confirm against the implementations.
  virtual void Launch(Stream* stream, size_t num_src_dims, const int64_t* src_dims,
                      const void* src, size_t num_dst_dims, const int64_t* dst_dims,
                      void* dst) = 0;
};
// Factory producing BroadcastElementwiseUnary instances for a given op,
// src/dst dtype pair and maximum rank. The overloads carry zero, one or two
// op-specific scalar attributes (e.g. clamp bounds).
class BroadcastElementwiseUnaryFactory : public Factory<BroadcastElementwiseUnary> {
 public:
  OF_DISALLOW_COPY_AND_MOVE(BroadcastElementwiseUnaryFactory);
  BroadcastElementwiseUnaryFactory() = default;
  ~BroadcastElementwiseUnaryFactory() override = default;

  // Op with no extra attributes.
  virtual std::unique_ptr<BroadcastElementwiseUnary> New(UnaryOp op, DataType src_type,
                                                         DataType dst_type,
                                                         size_t max_num_dims) = 0;
  // Op with one scalar attribute.
  virtual std::unique_ptr<BroadcastElementwiseUnary> New(UnaryOp op, DataType src_type,
                                                         DataType dst_type, size_t max_num_dims,
                                                         Scalar attr0) = 0;
  // Op with two scalar attributes.
  virtual std::unique_ptr<BroadcastElementwiseUnary> New(UnaryOp op, DataType src_type,
                                                         DataType dst_type, size_t max_num_dims,
                                                         Scalar attr0, Scalar attr1) = 0;
};
}
// namespace primitive
}
// namespace ep
}
// namespace oneflow
#endif // ONEFLOW_CORE_EP_PRIMITIVE_BROADCAST_ELEMENTWISE_UNARY_H_
oneflow/core/ep/include/primitive/fast_integer_math.h
View file @
a715222c
...
...
@@ -16,8 +16,10 @@ limitations under the License.
#ifndef ONEFLOW_CORE_EP_PRIMITIVE_FAST_INTEGER_MATH_H_
#define ONEFLOW_CORE_EP_PRIMITIVE_FAST_INTEGER_MATH_H_
#include "oneflow/core/common/data_type.h"
#include <cassert>
#ifdef WITH_ROCM
#include "hip/device_functions.h"  // /opt/rocm/hip/include/hip
// ROCm-only header; including it unconditionally would break CUDA builds,
// so it must stay inside the WITH_ROCM guard.
#include "device_functions.h"  // /opt/rocm/hip/include/hip
#endif
namespace
oneflow
{
...
...
oneflow/core/ep/include/primitive/tensor_fill.h
0 → 100644
View file @
a715222c
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_EP_PRIMITIVE_TENSOR_FILL_H_
#define ONEFLOW_CORE_EP_PRIMITIVE_TENSOR_FILL_H_
#include "oneflow/core/ep/include/primitive/primitive.h"
namespace
oneflow
{
namespace
ep
{
namespace
primitive
{
// Primitive that fills a destination tensor of `count` elements.
// NOTE(review): judging by the single-element `src` pointer this presumably
// replicates one value across dst — confirm against the implementations.
class TensorFill : public Primitive {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TensorFill);
  TensorFill() = default;
  ~TensorFill() override = default;

  virtual void Launch(Stream* stream, const void* src, void* dst, size_t count) = 0;
};
// Factory producing a TensorFill primitive specialized for `data_type`.
class TensorFillFactory : public Factory<TensorFill> {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TensorFillFactory);
  TensorFillFactory() = default;
  ~TensorFillFactory() override = default;

  virtual std::unique_ptr<TensorFill> New(DataType data_type) = 0;
};
}
// namespace primitive
}
// namespace ep
}
// namespace oneflow
#endif // ONEFLOW_CORE_EP_PRIMITIVE_TENSOR_FILL_H_
oneflow/core/ep/include/primitive/unary_op.h
View file @
a715222c
...
...
@@ -22,6 +22,7 @@ namespace ep {
namespace
primitive
{
enum
class
UnaryOp
{
kIdentity
,
// activation op
kElu
,
kCelu
,
...
...
@@ -40,13 +41,53 @@ enum class UnaryOp {
kSoftPlus
,
kTanh
,
kThreshold
,
kFastGelu
,
kQuickGelu
,
// math op
kAbs
,
kAcos
,
kAcosh
,
kAsin
,
kAsinh
,
kAtan
,
kAtanh
,
kCeil
,
kCos
,
kCosh
,
kErf
,
kErfc
,
kExp
,
kExpm1
,
kFloor
,
kLgamma
,
kLog
,
kLog2
,
kLog10
,
kLog1p
,
kLogSigmoid
,
kNegative
,
kReciprocal
,
kReciprocalNoNan
,
kRint
,
kRound
,
kRsqrt
,
kSigmoid
,
kSign
,
kSin
,
kSinh
,
kSqrt
,
kSquare
,
kTan
,
kTrunc
,
kNotEqualZero
,
// logical op
kLogicalNot
,
// utils op
kIsInf
,
kIsNan
,
kIsFinite
,
kNanAssign
,
};
}
...
...
oneflow/core/ep/include/stream.h
View file @
a715222c
...
...
@@ -20,6 +20,7 @@ limitations under the License.
#include "oneflow/core/common/device_type.h"
#include "oneflow/core/common/maybe.h"
#include "oneflow/core/ep/include/event.h"
#include "oneflow/core/ep/include/gpu_macro.h"
namespace
oneflow
{
...
...
@@ -37,6 +38,7 @@ class Stream {
virtual
Device
*
device
()
const
=
0
;
virtual
Maybe
<
void
>
Sync
()
=
0
;
virtual
void
RecordEvent
(
Event
*
event
)
=
0
;
virtual
Maybe
<
void
>
GetAsyncError
()
{
return
Maybe
<
void
>::
Ok
();
}
virtual
Maybe
<
void
>
OnExecutionContextSetup
()
{
return
Maybe
<
void
>::
Ok
();
}
virtual
Maybe
<
void
>
OnExecutionContextTeardown
()
{
return
Maybe
<
void
>::
Ok
();
}
...
...
oneflow/core/ep/rocm/cuda_device.cpp
deleted
100644 → 0
View file @
f262efc9
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/cuda_device.h"
#include "oneflow/core/ep/rocm/cuda_event.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
// #if CUDA_VERSION >= 11000
// #include <cuda_bf16.h>
// #endif
namespace
oneflow
{
namespace
ep
{
namespace
{
// Default number of elements in each preallocated constant buffer (1M).
constexpr size_t kDefaultConstBufElementCount = 1024 * 1024;

// Allocates a device buffer of n elements and fills every element with `value`
// by staging the pattern in host memory and copying it over.
template<typename T>
void CreateConstBuffer(void** buf, T value, size_t n) {
  const size_t byte_size = n * sizeof(T);
  std::vector<T> staging(n, value);
  OF_CUDA_CHECK(hipMalloc(buf, byte_size));
  OF_CUDA_CHECK(hipMemcpy(*buf, staging.data(), byte_size, hipMemcpyDefault));
}
}
// namespace
// Binds this Device object to HIP device `device_index`: queries its
// properties, chooses the event flags used for pooled events, and (unless
// disabled via env) preallocates device-resident constant zero/one buffers.
CudaDevice::CudaDevice(int device_index, DeviceManager* device_manager)
    : device_index_(device_index),
      event_flags_{},
      properties_{},
      device_manager_(device_manager),
      const_buf_elem_cnt_(0),
      const_zeros_buffer_(nullptr),
      const_ones_buffer_fp32_(nullptr),
      const_ones_buffer_fp16_(nullptr),
      const_ones_buffer_bf16_(nullptr) {
  // All allocations below must target this device.
  CudaCurrentDeviceGuard guard(device_index_);
  OF_CUDA_CHECK(hipGetDeviceProperties(&properties_, device_index_));
  // Timing is disabled on pooled events; blocking sync is opt-in via env var.
  event_flags_ = hipEventDisableTiming;
  if (ParseBooleanFromEnv("ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC", false)) {
    event_flags_ |= hipEventBlockingSync;
  }
  const_buf_elem_cnt_ =
      ParseIntegerFromEnv("ONEFLOW_EP_CUDA_CONST_BUFFER_ELEMENT_COUNT", kDefaultConstBufElementCount);
  if (const_buf_elem_cnt_ > 0) {
    CreateConstBuffer<float>(&const_zeros_buffer_, static_cast<float>(0), const_buf_elem_cnt_);
    CreateConstBuffer<float>(&const_ones_buffer_fp32_, static_cast<float>(1.0), const_buf_elem_cnt_);
    CreateConstBuffer<half>(&const_ones_buffer_fp16_, static_cast<half>(1.0), const_buf_elem_cnt_);
    // bf16 ones buffer is intentionally not allocated in this ROCm port, so
    // const_ones_buffer_bf16_ stays nullptr (GetConstOnes then returns nullptr for kBFloat16).
    // #if CUDA_VERSION >= 11000
    // CreateConstBuffer<nv_bfloat16>(&const_ones_buffer_bf16_, static_cast<nv_bfloat16>(1.0),
    // const_buf_elem_cnt_);
    // #endif
  }
}
// Destroys all pooled events and releases the constant buffers.
CudaDevice::~CudaDevice() {
  CudaCurrentDeviceGuard guard(device_index_);
  for (auto* pooled_event : events_) { delete pooled_event; }
  // hipFree on a null pointer is documented as a successful no-op, so buffers
  // that were never allocated (e.g. bf16 ones) are safe to pass here.
  OF_CUDA_CHECK(hipFree(const_zeros_buffer_));
  OF_CUDA_CHECK(hipFree(const_ones_buffer_fp32_));
  OF_CUDA_CHECK(hipFree(const_ones_buffer_fp16_));
  OF_CUDA_CHECK(hipFree(const_ones_buffer_bf16_));
}
// Makes this device the calling thread's current HIP device.
void CudaDevice::SetAsActiveDevice() { OF_CUDA_CHECK(hipSetDevice(device_index_)); }
// Creates a new CudaStream bound to this device; caller releases it via DestroyStream.
Stream* CudaDevice::CreateStream() {
  CudaCurrentDeviceGuard device_guard(device_index_);
  return new CudaStream(this);
}
// Deletes a stream previously obtained from CreateStream.
void CudaDevice::DestroyStream(Stream* stream) {
  CudaCurrentDeviceGuard device_guard(device_index_);
  delete stream;
}
// Hands out `count` events, reusing pooled events when available and
// constructing new CudaEvents for the remainder.
void CudaDevice::CreateEvents(Event** events, size_t count) {
  size_t copied = 0;
  {
    std::lock_guard<std::mutex> lock(events_mutex_);
    // Take up to `count` events from the tail of the pool.
    copied = std::min(count, events_.size());
    size_t offset = events_.size() - copied;
    std::copy(events_.begin() + offset, events_.end(), events);
    events_.resize(offset);
  }
  // Pool exhausted: construct the remaining events outside the lock.
  if (copied != count) {
    CudaCurrentDeviceGuard guard(device_index_);
    for (size_t i = copied; i < count; ++i) { events[i] = new CudaEvent(event_flags_); }
  }
}
// Returns `count` events to the pool instead of destroying them; they are
// reclaimed by later CreateEvents calls or freed in the destructor.
void CudaDevice::DestroyEvents(Event** events, size_t count) {
  std::lock_guard<std::mutex> pool_lock(events_mutex_);
  events_.insert(events_.end(), events, events + count);
}
// Allocates `size` bytes of device memory into *ptr.
// Returns a RuntimeError carrying the HIP error string on failure.
Maybe<void> CudaDevice::Alloc(const AllocationOptions& options, void** ptr, size_t size) {
  CudaCurrentDeviceGuard device_guard(device_index_);
  // Pinned-host requests must go through AllocPinned instead.
  CHECK(!options.HasPinnedDevice());
  const hipError_t alloc_status = hipMalloc(ptr, size);
  if (alloc_status == hipSuccess) { return Maybe<void>::Ok(); }
  return Error::RuntimeError() << hipGetErrorString(alloc_status);
}
// Frees device memory obtained from Alloc. (Param renamed attr -> options for
// consistency with the sibling allocation methods; definition-local only.)
void CudaDevice::Free(const AllocationOptions& options, void* ptr) {
  CudaCurrentDeviceGuard device_guard(device_index_);
  OF_CUDA_CHECK(hipFree(ptr));
}
// Allocates `size` bytes of pinned (page-locked) host memory into *ptr,
// NUMA-aware with respect to this device.
Maybe<void> CudaDevice::AllocPinned(const AllocationOptions& options, void** ptr, size_t size) {
  CudaCurrentDeviceGuard device_guard(device_index_);
  const hipError_t alloc_status = NumaAwareCudaMallocHost(device_index_, ptr, size);
  if (alloc_status == hipSuccess) { return Maybe<void>::Ok(); }
  return Error::RuntimeError() << hipGetErrorString(alloc_status);
}
// Frees pinned host memory obtained from AllocPinned.
void CudaDevice::FreePinned(const AllocationOptions& options, void* ptr) {
  CudaCurrentDeviceGuard device_guard(device_index_);
  OF_CUDA_CHECK(hipHostFree(ptr));
}
// Device properties queried once in the constructor.
const hipDeviceProp_t& CudaDevice::properties() const { return properties_; }
// Returns the shared zeros buffer when the requested byte span fits inside it
// (capacity is const_buf_elem_cnt_ floats), otherwise nullptr.
const void* CudaDevice::GetConstZeros(DataType data_type, size_t n) const {
  const size_t requested_bytes = GetSizeOfDataType(data_type) * n;
  const size_t capacity_bytes = GetSizeOfDataType(DataType::kFloat) * const_buf_elem_cnt_;
  return requested_bytes <= capacity_bytes ? const_zeros_buffer_ : nullptr;
}
// Returns the preallocated ones buffer for `data_type` when `n` elements fit,
// otherwise nullptr. Note: the bf16 buffer may itself be nullptr if it was
// never allocated (see the constructor).
const void* CudaDevice::GetConstOnes(DataType data_type, size_t n) const {
  if (n > const_buf_elem_cnt_) { return nullptr; }
  switch (data_type) {
    case DataType::kFloat: return const_ones_buffer_fp32_;
    case DataType::kFloat16: return const_ones_buffer_fp16_;
    case DataType::kBFloat16: return const_ones_buffer_bf16_;
    default: return nullptr;
  }
}
}
// namespace ep
}
// namespace oneflow
#endif // WITH_ROCM
oneflow/core/ep/rocm/cuda_device.h
deleted
100644 → 0
View file @
f262efc9
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
#define ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
#include "oneflow/core/ep/include/device.h"
#include "oneflow/core/common/data_type.h"
#ifdef WITH_ROCM
#include <hip/hip_runtime.h>
namespace
oneflow
{
namespace
ep
{
// HIP implementation of ep::Device (the "Cuda" name is kept from the upstream
// CUDA backend this ROCm port mirrors). Owns a pool of reusable events and
// device-resident constant zero/one buffers.
class CudaDevice : public Device {
 public:
  OF_DISALLOW_COPY_AND_MOVE(CudaDevice);
  explicit CudaDevice(int device_index, DeviceManager* device_manager);
  ~CudaDevice() override;

  void SetAsActiveDevice() override;

  DeviceType device_type() const override { return DeviceType::kCUDA; }
  size_t device_index() const override { return device_index_; }
  DeviceManager* device_manager() const override { return device_manager_; }

  Stream* CreateStream() override;
  void DestroyStream(Stream* stream) override;

  // Events are pooled: DestroyEvents returns them to the pool, CreateEvents
  // reuses pooled ones before constructing new events.
  void CreateEvents(Event** events, size_t count) override;
  void DestroyEvents(Event** events, size_t count) override;

  Maybe<void> Alloc(const AllocationOptions& options, void** ptr, size_t size) override;
  void Free(const AllocationOptions& options, void* ptr) override;
  Maybe<void> AllocPinned(const AllocationOptions& options, void** ptr, size_t size) override;
  void FreePinned(const AllocationOptions& options, void* ptr) override;

  const hipDeviceProp_t& properties() const;

  // Return the shared constant buffer when the request fits its capacity,
  // otherwise nullptr (caller must handle the fallback).
  const void* GetConstZeros(DataType data_type, size_t n) const;
  const void* GetConstOnes(DataType data_type, size_t n) const;

 private:
  int device_index_;
  std::mutex events_mutex_;        // guards events_ (the pooled-event free list)
  std::vector<Event*> events_;
  unsigned int event_flags_;       // flags applied to every pooled CudaEvent
  hipDeviceProp_t properties_;
  DeviceManager* device_manager_;  // non-owning back-pointer
  int64_t const_buf_elem_cnt_;     // capacity (elements) of each constant buffer; 0 disables them
  void* const_zeros_buffer_;
  void* const_ones_buffer_fp32_;
  void* const_ones_buffer_fp16_;
  void* const_ones_buffer_bf16_;   // may stay nullptr — not allocated in this port
};
}
// namespace ep
}
// namespace oneflow
#endif // WITH_ROCM
#endif // ONEFLOW_CORE_EP_ROCM_CUDA_DEVICE_H_
oneflow/core/ep/rocm/cuda_device_manager.cpp
deleted
100644 → 0
View file @
f262efc9
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/ep/rocm/cuda_device_manager.h"
#include "oneflow/core/device/cuda_util.h"
#ifdef WITH_ROCM
namespace
oneflow
{
namespace
ep
{
// Trivial lifecycle members: the manager only records its owning registry.
CudaDeviceManager::CudaDeviceManager(DeviceManagerRegistry* registry) : registry_(registry) {}

CudaDeviceManager::~CudaDeviceManager() = default;

// Non-owning pointer to the registry this manager was created with.
DeviceManagerRegistry* CudaDeviceManager::registry() const { return registry_; }
std
::
shared_ptr
<
Device
>
CudaDeviceManager
::
GetDevice
(
size_t
device_index
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
devices_mutex_
);
if
(
device_index
<
devices_
.
size
()
&&
devices_
.
at
(
device_index
))
{
return
devices_
.
at
(
device_index
);
}
auto
device
=
std
::
make_shared
<
CudaDevice
>
(
device_index
,
this
);
if
(
device_index
>=
devices_
.
size
())
{
devices_
.
resize
(
device_index
+
1
);
}
devices_
.
at
(
device_index
)
=
device
;
return
device
;
}
size_t
CudaDeviceManager
::
GetDeviceCount
(
size_t
primary_device_index
)
{
CudaCurrentDeviceGuard
guard
(
primary_device_index
);
return
this
->
GetDeviceCount
();
}
size_t
CudaDeviceManager
::
GetDeviceCount
()
{
int
count
=
0
;
hipError_t
err
=
hipGetDeviceCount
(
&
count
);
if
(
err
==
hipErrorNoDevice
||
err
==
hipErrorInsufficientDriver
)
{
return
0
;
}
OF_CUDA_CHECK
(
err
);
return
count
;
}
// Index of the calling thread's current HIP device.
size_t CudaDeviceManager::GetActiveDeviceIndex() {
  int current_device = 0;
  OF_CUDA_CHECK(hipGetDevice(&current_device));
  return static_cast<size_t>(current_device);
}
// Makes `device_index` the calling thread's current HIP device.
void CudaDeviceManager::SetActiveDeviceByIndex(size_t device_index) {
  OF_CUDA_CHECK(hipSetDevice(static_cast<int>(device_index)));
}
}
// namespace ep
}
// namespace oneflow
#endif // WITH_ROCM
Prev
1
…
19
20
21
22
23
24
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment