Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Oneflow
Commits
8f7de847
Commit
8f7de847
authored
Apr 25, 2023
by
yuguo960516yuguo
Browse files
dtk
parent
f262efc9
Pipeline
#248
failed with stages
in 0 seconds
Changes
121
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3026 additions
and
3026 deletions
+3026
-3026
oneflow/core/ndarray/xpu_ndarray_assign.hip.cpp
oneflow/core/ndarray/xpu_ndarray_assign.hip.cpp
+61
-61
oneflow/core/profiler/event.cpp
oneflow/core/profiler/event.cpp
+90
-90
oneflow/core/profiler/event.h
oneflow/core/profiler/event.h
+186
-186
oneflow/core/profiler/event_recorder.h
oneflow/core/profiler/event_recorder.h
+60
-60
oneflow/core/vm/sync_vm_mode_guard.h
oneflow/core/vm/sync_vm_mode_guard.h
+38
-38
oneflow/user/kernels/adaptive_pool_gpu_kernel.hip.cpp
oneflow/user/kernels/adaptive_pool_gpu_kernel.hip.cpp
+295
-295
oneflow/user/kernels/affine_grid_kernel.hip.cpp
oneflow/user/kernels/affine_grid_kernel.hip.cpp
+132
-132
oneflow/user/kernels/arange_kernel_util.hip.cpp
oneflow/user/kernels/arange_kernel_util.hip.cpp
+47
-47
oneflow/user/kernels/arg_sort_kernel.hip.cpp
oneflow/user/kernels/arg_sort_kernel.hip.cpp
+147
-147
oneflow/user/kernels/arg_where_kernel_util.hip.cpp
oneflow/user/kernels/arg_where_kernel_util.hip.cpp
+141
-141
oneflow/user/kernels/argmax_kernel.hip.cpp
oneflow/user/kernels/argmax_kernel.hip.cpp
+193
-193
oneflow/user/kernels/as_strided_kernel.hip.cpp
oneflow/user/kernels/as_strided_kernel.hip.cpp
+198
-198
oneflow/user/kernels/assign_if_kernel.hip.cpp
oneflow/user/kernels/assign_if_kernel.hip.cpp
+75
-75
oneflow/user/kernels/avg_pool_kernel.hip.cpp
oneflow/user/kernels/avg_pool_kernel.hip.cpp
+199
-199
oneflow/user/kernels/batch_gather_kernel_util.hip.cpp
oneflow/user/kernels/batch_gather_kernel_util.hip.cpp
+102
-102
oneflow/user/kernels/binary_cross_entropy_kernel.hip.cpp
oneflow/user/kernels/binary_cross_entropy_kernel.hip.cpp
+203
-203
oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.hip.cpp
...r/kernels/binary_cross_entropy_with_logits_kernel.hip.cpp
+372
-372
oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel.hip.cpp
...nels/binary_cross_entropy_with_logits_mean_kernel.hip.cpp
+276
-276
oneflow/user/kernels/broadcast_pow_grad_kernel.hip.cpp
oneflow/user/kernels/broadcast_pow_grad_kernel.hip.cpp
+87
-87
oneflow/user/kernels/categorical_ordinal_encode_kernel_util.hip.cpp
...er/kernels/categorical_ordinal_encode_kernel_util.hip.cpp
+124
-124
No files found.
Too many changes to show.
To preserve performance only
121 of 121+
files are displayed.
Plain diff
Email patch
oneflow/core/ndarray/xpu_ndarray_assign.hip.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/ndarray/ndarray_assign_core.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/kernel/kernel_util.h"
namespace oneflow {

namespace {

// Device kernel: copies a reduced-view ndarray into a destination ndarray.
// Element count and indexing are handled inside NdarrayAssignCore<...>::Assign.
template<typename T, typename X, int NDIMS>
__global__ void NdarrayAssignReducedGpu(XpuVarNdarray<T> y,
                                        const XpuReducedNdarray<X, NDIMS> reduced) {
  NdarrayAssignCore<T, X, NDIMS>::Assign(y, reduced);
}

// Device kernel: element-wise assignment x -> y between two var ndarrays.
template<typename T, typename X, int NDIMS>
__global__ void NdarrayAssignGpu(XpuVarNdarray<T> y, const XpuVarNdarray<const X> x) {
  NdarrayAssignCore<T, X, NDIMS>::Assign(y, x);
}

}  // namespace
// Host-side launcher for the assignment kernels, specialized for DeviceType::kCUDA
// (used for the ROCm/HIP build as well, via the hipified source).
template<typename T, typename X, int NDIMS>
struct NdarrayAssignCoreWrapper<DeviceType::kCUDA, T, X, NDIMS> final {
  // Assigns a reduced-view ndarray into *y on the given stream.
  static void Assign(ep::Stream* stream, XpuVarNdarray<T>* y,
                     const XpuReducedNdarray<X, NDIMS>& reduced) {
    size_t n = y->host_shape().HostElemNum();
    // Fix: skip the launch when there is nothing to assign, mirroring the guard
    // in the overload below. RUN_CUDA_KERNEL presumably derives the launch grid
    // from n, and a zero-sized grid is an invalid launch — TODO confirm the
    // macro's behavior for n == 0.
    if (n == 0) { return; }
    RUN_CUDA_KERNEL((NdarrayAssignReducedGpu<T, X, NDIMS>), stream, n, *y, reduced);
  }

  // Element-wise assignment x -> y on the given stream; no-op for empty shapes.
  static void Assign(ep::Stream* ctx, const XpuVarNdarray<T>& y,
                     const XpuVarNdarray<const X>& x) {
    size_t n = y.host_shape().HostElemNum();
    if (n == 0) { return; }
    RUN_CUDA_KERNEL((NdarrayAssignGpu<T, X, NDIMS>), ctx, n, y, x);
  }
};
#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS)                           \
  template struct NdarrayAssignCoreWrapper<DeviceType::kCUDA, OF_PP_PAIR_FIRST(ret_dtype_pair), \
                                           OF_PP_PAIR_FIRST(dtype_pair), NDIMS>;
// Explicit instantiations over the cross product of (dst dtype, src dtype, ndims)
// for arithmetic, unsigned-int and bool dtypes.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN,
                                 ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ
                                     BOOL_DATA_TYPE_SEQ,
                                 ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ
                                     BOOL_DATA_TYPE_SEQ,
                                 DIM_SEQ);
// half is instantiated separately (half -> half only).
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ,
                                 HALF_DATA_TYPE_SEQ, DIM_SEQ);
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/ndarray/ndarray_assign_core.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/kernel/kernel_util.h"
namespace oneflow {

namespace {

// Device kernel: copies a reduced-view ndarray into a destination ndarray.
template<typename T, typename X, int NDIMS>
__global__ void NdarrayAssignReducedGpu(XpuVarNdarray<T> y,
                                        const XpuReducedNdarray<X, NDIMS> reduced) {
  NdarrayAssignCore<T, X, NDIMS>::Assign(y, reduced);
}

// Device kernel: element-wise assignment x -> y between two var ndarrays.
template<typename T, typename X, int NDIMS>
__global__ void NdarrayAssignGpu(XpuVarNdarray<T> y, const XpuVarNdarray<const X> x) {
  NdarrayAssignCore<T, X, NDIMS>::Assign(y, x);
}

}  // namespace
// Host-side launcher for the assignment kernels, specialized for DeviceType::kCUDA.
template<typename T, typename X, int NDIMS>
struct NdarrayAssignCoreWrapper<DeviceType::kCUDA, T, X, NDIMS> final {
  // Assigns a reduced-view ndarray into *y on the given stream.
  static void Assign(ep::Stream* stream, XpuVarNdarray<T>* y,
                     const XpuReducedNdarray<X, NDIMS>& reduced) {
    size_t n = y->host_shape().HostElemNum();
    // Fix: skip the launch when there is nothing to assign, mirroring the guard
    // in the overload below. RUN_CUDA_KERNEL presumably derives the launch grid
    // from n, and a zero-sized grid is an invalid launch — TODO confirm the
    // macro's behavior for n == 0.
    if (n == 0) { return; }
    RUN_CUDA_KERNEL((NdarrayAssignReducedGpu<T, X, NDIMS>), stream, n, *y, reduced);
  }

  // Element-wise assignment x -> y on the given stream; no-op for empty shapes.
  static void Assign(ep::Stream* ctx, const XpuVarNdarray<T>& y,
                     const XpuVarNdarray<const X>& x) {
    size_t n = y.host_shape().HostElemNum();
    if (n == 0) { return; }
    RUN_CUDA_KERNEL((NdarrayAssignGpu<T, X, NDIMS>), ctx, n, y, x);
  }
};
#define INSTANTIATE_NDARRAY_ASSIGN(ret_dtype_pair, dtype_pair, NDIMS)                           \
  template struct NdarrayAssignCoreWrapper<DeviceType::kCUDA, OF_PP_PAIR_FIRST(ret_dtype_pair), \
                                           OF_PP_PAIR_FIRST(dtype_pair), NDIMS>;
// Explicit instantiations over the cross product of (dst dtype, src dtype, ndims).
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN,
                                 ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ
                                     BOOL_DATA_TYPE_SEQ,
                                 ARITHMETIC_DATA_TYPE_SEQ UNSIGNED_INT_DATA_TYPE_SEQ
                                     BOOL_DATA_TYPE_SEQ,
                                 DIM_SEQ);
// half is instantiated separately (half -> half only).
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_NDARRAY_ASSIGN, HALF_DATA_TYPE_SEQ,
                                 HALF_DATA_TYPE_SEQ, DIM_SEQ);

}  // namespace oneflow
\ No newline at end of file
oneflow/core/profiler/event.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "fmt/core.h"
#include "fmt/format.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/util.h"
using json = nlohmann::json;

namespace oneflow {
namespace profiler {

// Base JSON form shared by all event kinds: name, duration, and a placeholder
// for input shapes (overridden by KernelEvent::ToJson).
nlohmann::json IEvent::ToJson() {
  return json{{"name", name_}, {"time", GetDuration<double>()}, {"input_shapes", "-"}};
}

void IEvent::SetStartedAt(double t) { started_at_ = t; }

void IEvent::SetFinishedAt(double t) { finished_at_ = t; }

// Stamps the current time as the event's start timestamp.
// NOTE(review): GetTimeNow()'s unit is assumed to match time_unit_ — confirm.
void IEvent::Start() { SetStartedAt(GetTimeNow()); }

// Stamps the current time as the event's end timestamp.
void IEvent::Finish() { SetFinishedAt(GetTimeNow()); }
bool
IEvent
::
IsChildOf
(
const
IEvent
*
e
)
{
if
(
!
e
)
{
return
false
;
}
if
(
this
==
e
)
{
return
false
;
}
return
GetStartedAt
<
double
>
()
>=
e
->
GetStartedAt
<
double
>
()
&&
GetFinishedAt
<
double
>
()
<=
e
->
GetFinishedAt
<
double
>
();
}
// Read-only accessor for the event's name.
const std::string& IEvent::GetName() const { return name_; }

// Custom events are deduplicated/aggregated by name alone.
std::string CustomEvent::Key() { return name_; }

// Extends the base JSON with the event type and the custom sub-type.
nlohmann::json CustomEvent::ToJson() {
  auto j = IEvent::ToJson();
  j["type"] = EventType::kCustom;
  j["custom_type"] = type_;
  return j;
}
// Factory: the CustomEvent constructor is private, so creation goes through
// here and always yields a shared_ptr.
std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, CustomEventType type) {
  return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
}

// Kernel events are keyed by name plus the formatted input shapes, so the same
// kernel with different shapes aggregates separately.
std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
// Extends the base JSON with kernel-specific fields; memory size and child
// events are only tracked in CUDA/ROCm builds.
nlohmann::json KernelEvent::ToJson() {
  auto j = IEvent::ToJson();
  j["type"] = EventType::kOneflowKernel;
  j["input_shapes"] = GetFormatedInputShapes();
#if defined(WITH_CUDA) || defined(WITH_ROCM)
  j["memory_size"] = memory_size_;
  // children_ serializes via nlohmann's to_json for shared_ptr<IEvent>.
  if (!children_.empty()) { j["children"] = children_; }
#endif  // WITH_CUDA
  return j;
}
// Factory: the KernelEvent constructor is private; shape_getter (may be empty)
// is evaluated inside the constructor to capture input shapes.
std::shared_ptr<KernelEvent> KernelEvent::Create(
    const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter) {
  return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
}
// Formats the recorded input shapes as "[shape, shape, ...]". At most
// max_num_to_format shapes are printed; extras are elided with "...".
// A rank-0 shape ("()") is printed as "scalar"; no shapes yields "-".
std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
  if (input_shapes_.empty()) { return "-"; }
  std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
  // Fix: `auto i = 0` deduced int and was compared against a size_t
  // (signed/unsigned mismatch); use size_t for the index.
  for (size_t i = 0; i < shapes_formated.size(); ++i) {
    const std::string current_shape = input_shapes_[i].ToString();
    shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
  }
  if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
  return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
}
}
// namespace profiler
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "fmt/core.h"
#include "fmt/format.h"
#include "oneflow/core/profiler/event.h"
#include "oneflow/core/profiler/util.h"
using json = nlohmann::json;

namespace oneflow {
namespace profiler {

// Base JSON form shared by all event kinds: name, duration, and a placeholder
// for input shapes (overridden by KernelEvent::ToJson).
nlohmann::json IEvent::ToJson() {
  return json{{"name", name_}, {"time", GetDuration<double>()}, {"input_shapes", "-"}};
}

void IEvent::SetStartedAt(double t) { started_at_ = t; }

void IEvent::SetFinishedAt(double t) { finished_at_ = t; }

// Stamps the current time as the start / end timestamps.
// NOTE(review): GetTimeNow()'s unit is assumed to match time_unit_ — confirm.
void IEvent::Start() { SetStartedAt(GetTimeNow()); }

void IEvent::Finish() { SetFinishedAt(GetTimeNow()); }

// True when this event's [start, finish] interval is fully contained in e's.
// An event is never its own child; a null parent yields false.
bool IEvent::IsChildOf(const IEvent* e) {
  if (!e) { return false; }
  if (this == e) { return false; }
  return GetStartedAt<double>() >= e->GetStartedAt<double>()
         && GetFinishedAt<double>() <= e->GetFinishedAt<double>();
}
// Read-only accessor for the event's name.
const std::string& IEvent::GetName() const { return name_; }

// Custom events are deduplicated/aggregated by name alone.
std::string CustomEvent::Key() { return name_; }

// Extends the base JSON with the event type and the custom sub-type.
nlohmann::json CustomEvent::ToJson() {
  auto j = IEvent::ToJson();
  j["type"] = EventType::kCustom;
  j["custom_type"] = type_;
  return j;
}

// Factory: the CustomEvent constructor is private, so creation goes through here.
std::shared_ptr<CustomEvent> CustomEvent::Create(const std::string& name, CustomEventType type) {
  return std::shared_ptr<CustomEvent>(new CustomEvent(name, type));
}

// Kernel events are keyed by name plus the formatted input shapes.
std::string KernelEvent::Key() { return fmt::format("{}.{}", name_, GetFormatedInputShapes()); }
// Extends the base JSON with kernel-specific fields; memory size and child
// events are only tracked in CUDA/ROCm builds.
nlohmann::json KernelEvent::ToJson() {
  auto j = IEvent::ToJson();
  j["type"] = EventType::kOneflowKernel;
  j["input_shapes"] = GetFormatedInputShapes();
#if defined(WITH_CUDA) || defined(WITH_ROCM)
  j["memory_size"] = memory_size_;
  if (!children_.empty()) { j["children"] = children_; }
#endif  // WITH_CUDA
  return j;
}

// Factory: the KernelEvent constructor is private; shape_getter (may be empty)
// is evaluated inside the constructor to capture input shapes.
std::shared_ptr<KernelEvent> KernelEvent::Create(
    const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter) {
  return std::shared_ptr<KernelEvent>(new KernelEvent(name, shape_getter));
}
// Formats the recorded input shapes as "[shape, shape, ...]". At most
// max_num_to_format shapes are printed; extras are elided with "...".
// A rank-0 shape ("()") is printed as "scalar"; no shapes yields "-".
std::string KernelEvent::GetFormatedInputShapes(size_t max_num_to_format) {
  if (input_shapes_.empty()) { return "-"; }
  std::vector<std::string> shapes_formated(std::min(input_shapes_.size(), max_num_to_format));
  // Fix: `auto i = 0` deduced int and was compared against a size_t
  // (signed/unsigned mismatch); use size_t for the index.
  for (size_t i = 0; i < shapes_formated.size(); ++i) {
    const std::string current_shape = input_shapes_[i].ToString();
    shapes_formated[i] = current_shape == "()" ? "scalar" : current_shape;
  }
  if (input_shapes_.size() > max_num_to_format) { shapes_formated.emplace_back("..."); }
  return fmt::format("[{}]", fmt::join(shapes_formated, ", "));
}
}
// namespace profiler
}
// namespace oneflow
\ No newline at end of file
oneflow/core/profiler/event.h
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_
#define ONEFLOW_CORE_PROFILER_EVENT_H_
#include <functional>
#include <memory>
#include <vector>
#include "nlohmann/json.hpp"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/shape_view.h"
namespace oneflow {
namespace profiler {

class ProfileManager;

// Top-level category of a profiling event.
enum class EventType {
  kCustom,        // has three kinds
  kOneflowKernel  // OneFlow cpu/cuda kernel
};

// Sub-type for EventType::kCustom events.
enum class CustomEventType {
  kDefault,     // for record_function
  kCudaKernel,  // cuda kernel
  kCudaRuntime  // something like cudaLaunchKernel
};

// Unit in which an event's raw timestamps are stored.
enum class EventTimeUnit { kNS, kUS };
// Abstract base for all profiling events. Stores a name, the unit of its raw
// timestamps, and start/finish timestamps; subclasses define the aggregation
// key and JSON serialization.
class IEvent {
 public:
  OF_DISALLOW_COPY_AND_MOVE(IEvent);
  IEvent() = delete;
  IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {}

  // Aggregation key; events with equal keys are grouped together.
  virtual std::string Key() = 0;
  virtual nlohmann::json ToJson();
  virtual ~IEvent() = default;
  // Stamp the current time as start / finish.
  virtual void Start();
  virtual void Finish();

  // True when this event's time interval is contained in e's interval.
  bool IsChildOf(const IEvent* e);
  const std::string& GetName() const;

  // Accessors converted to the requested unit (microseconds by default);
  // specialized below for double and time_t.
  template<typename T>
  const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
  template<typename T>
  const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
  template<typename T>
  const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;

 protected:
  virtual void SetStartedAt(double t);
  virtual void SetFinishedAt(double t);

  std::string name_;
  EventTimeUnit time_unit_;  // unit of started_at_ / finished_at_
  double started_at_ = 0;
  double finished_at_ = 0;
};
// Converts a timestamp between nanoseconds and microseconds. Any other
// (src, dst) combination — including src == dst — returns the value unchanged.
inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) {
  const bool ns_to_us =
      src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS;
  if (ns_to_us) { return time_ / 1000; }
  const bool us_to_ns =
      src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS;
  if (us_to_ns) { return time_ * 1000; }
  return time_;
}
// double specializations convert the stored timestamps; time_t specializations
// truncate the double result via static_cast.
template<>
const inline double IEvent::GetStartedAt<double>(EventTimeUnit time_unit) const {
  return ConvertTime(started_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetStartedAt<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetStartedAt<double>(time_unit));
}
template<>
const inline double IEvent::GetFinishedAt<double>(EventTimeUnit time_unit) const {
  return ConvertTime(finished_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetFinishedAt<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetFinishedAt<double>(time_unit));
}
// Duration is finish minus start, both in the requested unit.
template<>
const inline double IEvent::GetDuration<double>(EventTimeUnit time_unit) const {
  return GetFinishedAt<double>(time_unit) - GetStartedAt<double>(time_unit);
}
template<>
const inline time_t IEvent::GetDuration<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetDuration<double>(time_unit));
}
// User-defined profiling event (e.g. record_function scopes, CUDA kernel /
// runtime events reported by external tooling).
class CustomEvent final : public IEvent {
 public:
  friend class ProfileManager;
  std::string Key() override;
  nlohmann::json ToJson() override;
  // Factory; the constructor is private.
  static std::shared_ptr<CustomEvent> Create(const std::string& name,
                                             CustomEventType type = CustomEventType::kDefault);

 private:
  CustomEventType type_;

  // kDefault events store nanosecond timestamps; other custom types store
  // microseconds.
  CustomEvent(const std::string& custom_name, CustomEventType type)
      : IEvent(custom_name,
               type == CustomEventType::kDefault ? EventTimeUnit::kNS : EventTimeUnit::kUS),
        type_(type) {}
};
// Profiling event for one OneFlow kernel execution. Records input shapes and,
// in CUDA/ROCm builds, a memory size and nested child events.
class KernelEvent final : public IEvent {
 public:
  std::string Key() override;
  nlohmann::json ToJson() override;
  // Factory; the constructor is private. shape_getter may be empty.
  static std::shared_ptr<KernelEvent> Create(
      const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter);

#if defined(WITH_CUDA) || defined(WITH_ROCM)
  void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
  // Unconditionally adopts e as a child.
  void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
  // Adopts e only if e's time interval lies inside this event's interval;
  // returns whether it was adopted.
  bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
    if (e->IsChildOf(dynamic_cast<IEvent*>(this))) {
      children_.emplace(e);
      return true;
    }
    return false;
  }
  bool HasChildEvent(const std::shared_ptr<IEvent>& e) { return children_.count(e); }
  // Applies f to every child event.
  void WalkAmongChildren(const std::function<void(const std::shared_ptr<IEvent>& e)>& f) const {
    for (const auto& x : children_) { f(x); }
  }
#endif  // WITH_CUDA

 private:
  // Kernel timestamps are recorded in nanoseconds.
  KernelEvent(const std::string& kernel_name,
              const std::function<std::vector<Shape>(void)>& shape_getter)
      : IEvent(kernel_name, EventTimeUnit::kNS) {
    if (shape_getter) { input_shapes_ = shape_getter(); }
  }

#if defined(WITH_CUDA) || defined(WITH_ROCM)
  int64_t memory_size_ = -1;  // -1 means "not set"
  std::set<std::shared_ptr<IEvent>> children_;
#endif  // WITH_CUDA

  std::vector<Shape> input_shapes_;
  std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
};
}  // namespace profiler
}  // namespace oneflow

namespace nlohmann {

// ADL hook so nlohmann::json can serialize shared_ptr<IEvent> directly
// (used e.g. for KernelEvent children).
inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) {
  j = event->ToJson();
}

}  // namespace nlohmann

#endif  // ONEFLOW_CORE_PROFILER_EVENT_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_H_
#define ONEFLOW_CORE_PROFILER_EVENT_H_
#include <functional>
#include <memory>
#include <vector>
#include "nlohmann/json.hpp"
#include "oneflow/core/common/util.h"
#include "oneflow/core/common/shape_view.h"
namespace oneflow {
namespace profiler {

class ProfileManager;

// Top-level category of a profiling event.
enum class EventType {
  kCustom,        // has three kinds
  kOneflowKernel  // OneFlow cpu/cuda kernel
};

// Sub-type for EventType::kCustom events.
enum class CustomEventType {
  kDefault,     // for record_function
  kCudaKernel,  // cuda kernel
  kCudaRuntime  // something like cudaLaunchKernel
};

// Unit in which an event's raw timestamps are stored.
enum class EventTimeUnit { kNS, kUS };

// Abstract base for all profiling events. Stores a name, the unit of its raw
// timestamps, and start/finish timestamps; subclasses define the aggregation
// key and JSON serialization.
class IEvent {
 public:
  OF_DISALLOW_COPY_AND_MOVE(IEvent);
  IEvent() = delete;
  IEvent(const std::string& name, EventTimeUnit time_unit) : name_(name), time_unit_(time_unit) {}

  // Aggregation key; events with equal keys are grouped together.
  virtual std::string Key() = 0;
  virtual nlohmann::json ToJson();
  virtual ~IEvent() = default;
  virtual void Start();
  virtual void Finish();

  bool IsChildOf(const IEvent* e);
  const std::string& GetName() const;

  // Accessors converted to the requested unit (microseconds by default).
  template<typename T>
  const T GetDuration(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
  template<typename T>
  const T GetStartedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;
  template<typename T>
  const T GetFinishedAt(EventTimeUnit time_unit = EventTimeUnit::kUS) const;

 protected:
  virtual void SetStartedAt(double t);
  virtual void SetFinishedAt(double t);

  std::string name_;
  EventTimeUnit time_unit_;  // unit of started_at_ / finished_at_
  double started_at_ = 0;
  double finished_at_ = 0;
};
// Converts a timestamp between nanoseconds and microseconds; any other
// (src, dst) combination returns the value unchanged.
inline double ConvertTime(double time_, EventTimeUnit src_time_unit, EventTimeUnit dst_time_unit) {
  if (src_time_unit == EventTimeUnit::kNS && dst_time_unit == EventTimeUnit::kUS) {
    return time_ / 1000;
  }
  if (src_time_unit == EventTimeUnit::kUS && dst_time_unit == EventTimeUnit::kNS) {
    return time_ * 1000;
  }
  return time_;
}

// double specializations convert the stored timestamps; time_t specializations
// truncate the double result via static_cast.
template<>
const inline double IEvent::GetStartedAt<double>(EventTimeUnit time_unit) const {
  return ConvertTime(started_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetStartedAt<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetStartedAt<double>(time_unit));
}
template<>
const inline double IEvent::GetFinishedAt<double>(EventTimeUnit time_unit) const {
  return ConvertTime(finished_at_, time_unit_, time_unit);
}
template<>
const inline time_t IEvent::GetFinishedAt<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetFinishedAt<double>(time_unit));
}
// Duration is finish minus start, both in the requested unit.
template<>
const inline double IEvent::GetDuration<double>(EventTimeUnit time_unit) const {
  return GetFinishedAt<double>(time_unit) - GetStartedAt<double>(time_unit);
}
template<>
const inline time_t IEvent::GetDuration<time_t>(EventTimeUnit time_unit) const {
  return static_cast<time_t>(GetDuration<double>(time_unit));
}
// User-defined profiling event (e.g. record_function scopes, CUDA kernel /
// runtime events reported by external tooling).
class CustomEvent final : public IEvent {
 public:
  friend class ProfileManager;
  std::string Key() override;
  nlohmann::json ToJson() override;
  // Factory; the constructor is private.
  static std::shared_ptr<CustomEvent> Create(const std::string& name,
                                             CustomEventType type = CustomEventType::kDefault);

 private:
  CustomEventType type_;

  // kDefault events store nanosecond timestamps; other custom types store
  // microseconds.
  CustomEvent(const std::string& custom_name, CustomEventType type)
      : IEvent(custom_name,
               type == CustomEventType::kDefault ? EventTimeUnit::kNS : EventTimeUnit::kUS),
        type_(type) {}
};

// Profiling event for one OneFlow kernel execution. Records input shapes and,
// in CUDA/ROCm builds, a memory size and nested child events.
class KernelEvent final : public IEvent {
 public:
  std::string Key() override;
  nlohmann::json ToJson() override;
  // Factory; the constructor is private. shape_getter may be empty.
  static std::shared_ptr<KernelEvent> Create(
      const std::string& name, const std::function<std::vector<Shape>(void)>& shape_getter);

#if defined(WITH_CUDA) || defined(WITH_ROCM)
  void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; }
  // Unconditionally adopts e as a child.
  void AddChildEvent(const std::shared_ptr<IEvent>& e) { children_.emplace(e); }
  // Adopts e only if e's time interval lies inside this event's interval;
  // returns whether it was adopted.
  bool AddChildEventIfSo(const std::shared_ptr<IEvent>& e) {
    if (e->IsChildOf(dynamic_cast<IEvent*>(this))) {
      children_.emplace(e);
      return true;
    }
    return false;
  }
  bool HasChildEvent(const std::shared_ptr<IEvent>& e) { return children_.count(e); }
  // Applies f to every child event.
  void WalkAmongChildren(const std::function<void(const std::shared_ptr<IEvent>& e)>& f) const {
    for (const auto& x : children_) { f(x); }
  }
#endif  // WITH_CUDA

 private:
  // Kernel timestamps are recorded in nanoseconds.
  KernelEvent(const std::string& kernel_name,
              const std::function<std::vector<Shape>(void)>& shape_getter)
      : IEvent(kernel_name, EventTimeUnit::kNS) {
    if (shape_getter) { input_shapes_ = shape_getter(); }
  }

#if defined(WITH_CUDA) || defined(WITH_ROCM)
  int64_t memory_size_ = -1;  // -1 means "not set"
  std::set<std::shared_ptr<IEvent>> children_;
#endif  // WITH_CUDA

  std::vector<Shape> input_shapes_;
  std::string GetFormatedInputShapes(size_t max_num_to_format = 4);
};
}  // namespace profiler
}  // namespace oneflow

namespace nlohmann {

// ADL hook so nlohmann::json can serialize shared_ptr<IEvent> directly.
inline void to_json(json& j, const std::shared_ptr<::oneflow::profiler::IEvent>& event) {
  j = event->ToJson();
}

}  // namespace nlohmann

#endif  // ONEFLOW_CORE_PROFILER_EVENT_H_
oneflow/core/profiler/event_recorder.h
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#include "oneflow/core/common/util.h"
#include "oneflow/core/profiler/event.h"
namespace oneflow {
namespace profiler {

// RAII recorder: registers the event with the profile manager and starts it on
// construction, finishes it on destruction.
class EventRecorder {
 public:
  using ShapeGetterFuncType = std::function<std::vector<Shape>(void)>;

  OF_DISALLOW_COPY_AND_MOVE(EventRecorder);

  explicit EventRecorder(const std::shared_ptr<IEvent>& event) : event_(event) {
    CHECK_JUST(RegisterEventToProfileManager(event));
    event_->Start();
  }

  Maybe<void> RegisterEventToProfileManager(const std::shared_ptr<IEvent>& event);

  // Finishes and releases the event (if any) when the recorder goes out of scope.
  ~EventRecorder() {
    if (event_) {
      event_->Finish();
      event_.reset();
    }
  }

  static std::shared_ptr<EventRecorder> CreateCustomEventRecorder(const std::string& name);

  // memory_size_getter only exists in CUDA/ROCm builds.
  static Maybe<EventRecorder> CreateKernelEventRecorder(
      const std::string& name,
#if defined(WITH_CUDA) || defined(WITH_ROCM)
      const std::function<int64_t()>& memory_size_getter,
#endif
      const ShapeGetterFuncType& shape_getter);

 private:
  std::shared_ptr<IEvent> event_;
};

}  // namespace profiler
}  // namespace oneflow

#endif  // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#define ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
#include "oneflow/core/common/util.h"
#include "oneflow/core/profiler/event.h"
namespace oneflow {
namespace profiler {

// RAII recorder: registers the event with the profile manager and starts it on
// construction, finishes it on destruction.
class EventRecorder {
 public:
  using ShapeGetterFuncType = std::function<std::vector<Shape>(void)>;

  OF_DISALLOW_COPY_AND_MOVE(EventRecorder);

  explicit EventRecorder(const std::shared_ptr<IEvent>& event) : event_(event) {
    CHECK_JUST(RegisterEventToProfileManager(event));
    event_->Start();
  }

  Maybe<void> RegisterEventToProfileManager(const std::shared_ptr<IEvent>& event);

  // Finishes and releases the event (if any) when the recorder goes out of scope.
  ~EventRecorder() {
    if (event_) {
      event_->Finish();
      event_.reset();
    }
  }

  static std::shared_ptr<EventRecorder> CreateCustomEventRecorder(const std::string& name);

  // memory_size_getter only exists in CUDA/ROCm builds.
  static Maybe<EventRecorder> CreateKernelEventRecorder(
      const std::string& name,
#if defined(WITH_CUDA) || defined(WITH_ROCM)
      const std::function<int64_t()>& memory_size_getter,
#endif
      const ShapeGetterFuncType& shape_getter);

 private:
  std::shared_ptr<IEvent> event_;
};

}  // namespace profiler
}  // namespace oneflow

#endif  // ONEFLOW_CORE_PROFILER_EVENT_RECORDER_H_
oneflow/core/vm/sync_vm_mode_guard.h
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
#define ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
#include "oneflow/core/common/thread_local_guard.h"
namespace oneflow {

// Thread-local switch controlling whether VM instruction dispatch runs
// synchronously. kInvalid is the unset state.
enum class SyncVmMode {
  kInvalid = 0,
  kEnable = 1,
  kDisable = 2,
};

// RAII thread-local guard that sets the sync-VM mode for the current scope.
class SyncVmModeGuard final : public ThreadLocalGuard<SyncVmMode> {
 public:
  using ThreadLocalGuard<SyncVmMode>::ThreadLocalGuard;
  ~SyncVmModeGuard() = default;

  // True only when a guard is active on this thread and it set kEnable.
  static bool IsCurrentSyncVmMode() {
    const auto& opt_sync_mode = Current();
    return opt_sync_mode.has_value() && CHECK_JUST(opt_sync_mode) == SyncVmMode::kEnable;
  }
};

}  // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifndef ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
#define ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
#include "oneflow/core/common/thread_local_guard.h"
namespace oneflow {

// Thread-local switch controlling whether VM instruction dispatch runs
// synchronously. kInvalid is the unset state.
enum class SyncVmMode {
  kInvalid = 0,
  kEnable = 1,
  kDisable = 2,
};

// RAII thread-local guard that sets the sync-VM mode for the current scope.
class SyncVmModeGuard final : public ThreadLocalGuard<SyncVmMode> {
 public:
  using ThreadLocalGuard<SyncVmMode>::ThreadLocalGuard;
  ~SyncVmModeGuard() = default;

  // True only when a guard is active on this thread and it set kEnable.
  static bool IsCurrentSyncVmMode() {
    const auto& opt_sync_mode = Current();
    return opt_sync_mode.has_value() && CHECK_JUST(opt_sync_mode) == SyncVmMode::kEnable;
  }
};

}  // namespace oneflow

#endif  // ONEFLOW_CORE_VM_SYNC_VM_MODE_GUARD_H_
\ No newline at end of file
oneflow/user/kernels/adaptive_pool_gpu_kernel.hip.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/common/data_type.h"
#include "oneflow/core/kernel/util/cuda_half_util.h"
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/operator/operator_util.h"
#include "oneflow/user/utils/pool_util.h"
#include <algorithm>
#include <cfloat>
#include <cmath>
namespace
oneflow
{
namespace
user_op
{
#define START_IND(a, b, c) (int)std::floor((float)(a * c) / b)
#define END_IND(a, b, c) (int)std::ceil((float)((a + 1) * c) / b)
#define START_IND_INT(a, b, c) ((a * c) / b)
#define END_IND_INT(a, b, c) (((a + 1) * c + b - 1) / b)
// Zero-fills ptr[0, elements): each thread walks the range with a stride of
// the total thread count (gridDim.x * blockDim.x).
template<typename T>
__global__ void InitPtr(int elements, T* ptr) {
  const int stride = gridDim.x * blockDim.x;
  for (int i = (blockDim.x * blockIdx.x) + threadIdx.x; i < elements; i += stride) {
    ptr[i] = static_cast<T>(0);
  }
}
// Normalizes a pooling tensor shape to 5-D NCDHW form. `dim` is the number of
// spatial dims actually present (1/2/3); GetInDim supplies the padded values.
inline Shape GetShape5D(const Shape& shape, const std::string& data_format, int32_t dim) {
  const FixedDimVector spatial = {GetInDim(shape, data_format, 0, dim),
                                  GetInDim(shape, data_format, 1, dim),
                                  GetInDim(shape, data_format, 2, dim)};
  return Shape({shape.At(0), shape.At(1), spatial.at(0), spatial.at(1), spatial.at(2)});
}
// Forward adaptive average pooling: every output element is the mean of its
// adaptively-sized input window (bounds from START_IND/END_IND).
template<typename T>
__global__ void AdaptiveAvgPoolCudaKernel(const T* input, T* output, int num_elems, int in_d,
                                          int in_h, int in_w, int out_d, int out_h, int out_w) {
  const int out_panel_size = out_d * out_h * out_w;
  const int in_panel_size = in_d * in_h * in_w;
  CUDA_1D_KERNEL_LOOP(idx, num_elems) {
    // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper'
    // Decompose the flat output index into (batch*channel, d, h, w).
    const int bc_idx = idx / out_panel_size;
    const int out_d_idx = (idx % out_panel_size) / out_w / out_h;
    const int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w;
    const int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w;

    // Adaptive window start and extent along each spatial axis.
    const int in_start_d = START_IND(out_d_idx, out_d, in_d);
    const int k_d = END_IND(out_d_idx, out_d, in_d) - in_start_d;
    const int in_start_h = START_IND(out_h_idx, out_h, in_h);
    const int k_h = END_IND(out_h_idx, out_h, in_h) - in_start_h;
    const int in_start_w = START_IND(out_w_idx, out_w, in_w);
    const int k_w = END_IND(out_w_idx, out_w, in_w) - in_start_w;

    const T* in_ptr = input + bc_idx * in_panel_size + in_start_d * in_h * in_w
                      + in_start_h * in_w + in_start_w;
    T sum = static_cast<T>(0);
    for (int id = 0; id < k_d; ++id) {
      for (int ih = 0; ih < k_h; ++ih) {
        for (int iw = 0; iw < k_w; ++iw) { sum += in_ptr[ih * in_w + iw]; }
      }
      in_ptr += in_h * in_w;  // next input depth slice
    }
    output[idx] = sum / k_d / k_h / k_w;
  }
}
// Backward adaptive average pooling: each output gradient is spread uniformly
// over its input window; atomics are used because windows can overlap.
template<typename T>
__global__ void AdaptiveAvgPoolGradCudaKernel(T* input, const T* output, int num_elems, int in_d,
                                              int in_h, int in_w, int out_d, int out_h,
                                              int out_w) {
  const int out_panel_size = out_d * out_h * out_w;
  const int in_panel_size = in_d * in_h * in_w;
  CUDA_1D_KERNEL_LOOP(idx, num_elems) {
    // TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper'
    const int bc_idx = idx / out_panel_size;
    const int out_d_idx = (idx % out_panel_size) / out_w / out_h;
    const int out_h_idx = (idx % out_panel_size) % (out_h * out_w) / out_w;
    const int out_w_idx = (idx % out_panel_size) % (out_h * out_w) % out_w;

    const int in_start_d = START_IND(out_d_idx, out_d, in_d);
    const int k_d = END_IND(out_d_idx, out_d, in_d) - in_start_d;
    const int in_start_h = START_IND(out_h_idx, out_h, in_h);
    const int k_h = END_IND(out_h_idx, out_h, in_h) - in_start_h;
    const int in_start_w = START_IND(out_w_idx, out_w, in_w);
    const int k_w = END_IND(out_w_idx, out_w, in_w) - in_start_w;

    // Per-element share of the output gradient.
    const T grad_delta = output[idx] / k_d / k_h / k_w;
    T* input_ptr = input + bc_idx * in_panel_size + in_start_d * in_h * in_w
                   + in_start_h * in_w + in_start_w;
    for (int id = 0; id < k_d; ++id) {
      for (int ih = 0; ih < k_h; ++ih) {
        for (int iw = 0; iw < k_w; ++iw) {
          // TODO (Tianyu): Use 'atmoic::Add' when necessary
          cuda::atomic::Add(input_ptr + ih * in_w + iw, grad_delta);
        }
      }
      input_ptr += in_h * in_w;  // next input depth slice
    }
  }
}
// Host-side forward launcher shared by the 1d/2d/3d kernels: normalizes both
// shapes to 5-D and launches one thread per output element.
template<typename T>
void AvgForwardCompute(KernelComputeContext* ctx, const int32_t& dim) {
  const Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
  Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
  const T* in_ptr = in_tensor->dptr<T>();
  T* out_ptr = out_tensor->mut_dptr<T>();

  const Shape& x_shape = ctx->TensorDesc4ArgNameAndIndex("x", 0)->shape();
  const Shape& y_shape = ctx->TensorDesc4ArgNameAndIndex("y", 0)->shape();
  // TODO (Tianyu): Support 'channels_last'
  std::string data_format = "channels_first";
  const Shape& in = GetShape5D(x_shape, data_format, dim);
  const Shape& out = GetShape5D(y_shape, data_format, dim);

  const int out_elems = out_tensor->shape_view().elem_cnt();
  RUN_CUDA_KERNEL((AdaptiveAvgPoolCudaKernel<T>), ctx->stream(), out_elems, in_ptr, out_ptr,
                  out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4));
}
// Host-side backward launcher: zero-fills dx, then scatters dy gradients into
// it with the atomic grad kernel.
template<typename T>
void AvgBackwardCompute(KernelComputeContext* ctx, const int32_t& dim) {
  const Tensor* out_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
  Tensor* in_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
  const T* out_ptr = out_tensor->dptr<T>();
  T* in_ptr = in_tensor->mut_dptr<T>();

  const Shape& dx_shape = ctx->TensorDesc4ArgNameAndIndex("dx", 0)->shape();
  const Shape& dy_shape = ctx->TensorDesc4ArgNameAndIndex("dy", 0)->shape();
  // TODO (Tianyu): Support 'channels_last'
  std::string data_format = "channels_first";
  const Shape& in = GetShape5D(dx_shape, data_format, dim);
  const Shape& out = GetShape5D(dy_shape, data_format, dim);

  const int in_elems = in_tensor->shape_view().elem_cnt();
  const int out_elems = out_tensor->shape_view().elem_cnt();
  // dx must start at zero: the grad kernel only accumulates via atomic adds.
  RUN_CUDA_KERNEL((InitPtr<T>), ctx->stream(), in_elems, in_elems, in_ptr);
  RUN_CUDA_KERNEL((AdaptiveAvgPoolGradCudaKernel<T>), ctx->stream(), out_elems, in_ptr, out_ptr,
                  out_elems, in.At(2), in.At(3), in.At(4), out.At(2), out.At(3), out.At(4));
}
// 1-D adaptive average pooling forward kernel.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool1dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool1dKernel() = default;
  ~GpuAdaptiveAvgPool1dKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 1); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// 2-D adaptive average pooling forward kernel.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool2dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool2dKernel() = default;
  ~GpuAdaptiveAvgPool2dKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 2); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// 3-D adaptive average pooling forward kernel.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool3dKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool3dKernel() = default;
  ~GpuAdaptiveAvgPool3dKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgForwardCompute<T>(ctx, 3); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// 1-D adaptive average pooling backward kernel.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool1dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool1dGradKernel() = default;
  ~GpuAdaptiveAvgPool1dGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 1); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// 2-D adaptive average pooling backward kernel.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool2dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool2dGradKernel() = default;
  ~GpuAdaptiveAvgPool2dGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 2); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// 3-D adaptive average pooling backward kernel.
template<DeviceType device_type, typename T>
class GpuAdaptiveAvgPool3dGradKernel final : public OpKernel {
 public:
  GpuAdaptiveAvgPool3dGradKernel() = default;
  ~GpuAdaptiveAvgPool3dGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(KernelComputeContext* ctx) const override { AvgBackwardCompute<T>(ctx, 3); }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers the forward 1d/2d/3d kernels for one (device, dtype) pair,
// matched on device type and output dtype.
#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(device, dtype)                   \
  REGISTER_USER_KERNEL("adaptive_avg_pool1d")                                  \
      .SetCreateFn<GpuAdaptiveAvgPool1dKernel<device, dtype>>()                \
      .SetIsMatchedHob((HobDeviceType() == device)                             \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value)); \
  REGISTER_USER_KERNEL("adaptive_avg_pool2d")                                  \
      .SetCreateFn<GpuAdaptiveAvgPool2dKernel<device, dtype>>()                \
      .SetIsMatchedHob((HobDeviceType() == device)                             \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value)); \
  REGISTER_USER_KERNEL("adaptive_avg_pool3d")                                  \
      .SetCreateFn<GpuAdaptiveAvgPool3dKernel<device, dtype>>()                \
      .SetIsMatchedHob((HobDeviceType() == device)                             \
                       && (HobDataType("y", 0) == GetDataType<dtype>::value));

REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, float);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, double);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(DeviceType::kCUDA, int);
// Registers the backward 1d/2d/3d kernels for one (device, dtype) pair,
// matched on device type and input-gradient dtype.
#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(device, dtype)           \
  REGISTER_USER_KERNEL("adaptive_avg_pool1d_grad")                              \
      .SetCreateFn<GpuAdaptiveAvgPool1dGradKernel<device, dtype>>()             \
      .SetIsMatchedHob((HobDeviceType() == device)                              \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value)); \
  REGISTER_USER_KERNEL("adaptive_avg_pool2d_grad")                              \
      .SetCreateFn<GpuAdaptiveAvgPool2dGradKernel<device, dtype>>()             \
      .SetIsMatchedHob((HobDeviceType() == device)                              \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value)); \
  REGISTER_USER_KERNEL("adaptive_avg_pool3d_grad")                              \
      .SetCreateFn<GpuAdaptiveAvgPool3dGradKernel<device, dtype>>()             \
      .SetIsMatchedHob((HobDeviceType() == device)                              \
                       && (HobDataType("dx", 0) == GetDataType<dtype>::value));

REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, float);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, double);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(DeviceType::kCUDA, int);
}
// namespace user_op
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/device/cuda_util.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/common/data_type.h"
#include "oneflow/core/kernel/util/cuda_half_util.h"
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/operator/operator_util.h"
#include "oneflow/user/utils/pool_util.h"
#include <algorithm>
#include <cfloat>
#include <cmath>
namespace
oneflow
{
namespace
user_op
{
#define START_IND(a, b, c) (int)std::floor((float)(a * c) / b)
#define END_IND(a, b, c) (int)std::ceil((float)((a + 1) * c) / b)
#define START_IND_INT(a, b, c) ((a * c) / b)
#define END_IND_INT(a, b, c) (((a + 1) * c + b - 1) / b)
template
<
typename
T
>
__global__
void
InitPtr
(
int
elements
,
T
*
ptr
)
{
int
gid
=
(
blockDim
.
x
*
blockIdx
.
x
)
+
threadIdx
.
x
;
int
step
=
gridDim
.
x
*
blockDim
.
x
;
while
(
gid
<
elements
)
{
ptr
[
gid
]
=
static_cast
<
T
>
(
0
);
gid
+=
step
;
}
}
inline
Shape
GetShape5D
(
const
Shape
&
shape
,
const
std
::
string
&
data_format
,
int32_t
dim
)
{
FixedDimVector
shape_3d
=
{
GetInDim
(
shape
,
data_format
,
0
,
dim
),
GetInDim
(
shape
,
data_format
,
1
,
dim
),
GetInDim
(
shape
,
data_format
,
2
,
dim
)};
return
Shape
({
shape
.
At
(
0
),
shape
.
At
(
1
),
shape_3d
.
at
(
0
),
shape_3d
.
at
(
1
),
shape_3d
.
at
(
2
)});
}
template
<
typename
T
>
__global__
void
AdaptiveAvgPoolCudaKernel
(
const
T
*
input
,
T
*
output
,
int
num_elems
,
int
in_d
,
int
in_h
,
int
in_w
,
int
out_d
,
int
out_h
,
int
out_w
)
{
const
int
out_panel_size
=
out_d
*
out_h
*
out_w
;
const
int
in_panel_size
=
in_d
*
in_h
*
in_w
;
CUDA_1D_KERNEL_LOOP
(
idx
,
num_elems
)
{
// TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper'
int
bc_idx
=
idx
/
out_panel_size
;
int
out_d_idx
=
(
idx
%
out_panel_size
)
/
out_w
/
out_h
;
int
out_h_idx
=
(
idx
%
out_panel_size
)
%
(
out_h
*
out_w
)
/
out_w
;
int
out_w_idx
=
(
idx
%
out_panel_size
)
%
(
out_h
*
out_w
)
%
out_w
;
int
in_start_d
=
START_IND
(
out_d_idx
,
out_d
,
in_d
);
int
in_end_d
=
END_IND
(
out_d_idx
,
out_d
,
in_d
);
int
k_d
=
in_end_d
-
in_start_d
;
int
in_start_h
=
START_IND
(
out_h_idx
,
out_h
,
in_h
);
int
in_end_h
=
END_IND
(
out_h_idx
,
out_h
,
in_h
);
int
k_h
=
in_end_h
-
in_start_h
;
int
in_start_w
=
START_IND
(
out_w_idx
,
out_w
,
in_w
);
int
in_end_w
=
END_IND
(
out_w_idx
,
out_w
,
in_w
);
int
k_w
=
in_end_w
-
in_start_w
;
const
T
*
in_ptr
=
input
+
bc_idx
*
in_panel_size
+
in_start_d
*
in_h
*
in_w
+
in_start_h
*
in_w
+
in_start_w
;
T
sum
=
static_cast
<
T
>
(
0
);
for
(
int
id
=
0
;
id
<
k_d
;
++
id
)
{
for
(
int
ih
=
0
;
ih
<
k_h
;
++
ih
)
{
for
(
int
iw
=
0
;
iw
<
k_w
;
++
iw
)
{
T
val
=
*
(
in_ptr
+
ih
*
in_w
+
iw
);
sum
+=
val
;
}
}
in_ptr
+=
in_h
*
in_w
;
// next input depth
}
// Update output
output
[
idx
]
=
sum
/
k_d
/
k_h
/
k_w
;
}
}
template
<
typename
T
>
__global__
void
AdaptiveAvgPoolGradCudaKernel
(
T
*
input
,
const
T
*
output
,
int
num_elems
,
int
in_d
,
int
in_h
,
int
in_w
,
int
out_d
,
int
out_h
,
int
out_w
)
{
const
int
out_panel_size
=
out_d
*
out_h
*
out_w
;
const
int
in_panel_size
=
in_d
*
in_h
*
in_w
;
CUDA_1D_KERNEL_LOOP
(
idx
,
num_elems
)
{
// TODO (Tianyu): Replace following codes with 'NdIndexOffsetHelper'
int
bc_idx
=
idx
/
out_panel_size
;
int
out_d_idx
=
(
idx
%
out_panel_size
)
/
out_w
/
out_h
;
int
out_h_idx
=
(
idx
%
out_panel_size
)
%
(
out_h
*
out_w
)
/
out_w
;
int
out_w_idx
=
(
idx
%
out_panel_size
)
%
(
out_h
*
out_w
)
%
out_w
;
int
in_start_d
=
START_IND
(
out_d_idx
,
out_d
,
in_d
);
int
in_end_d
=
END_IND
(
out_d_idx
,
out_d
,
in_d
);
int
k_d
=
in_end_d
-
in_start_d
;
int
in_start_h
=
START_IND
(
out_h_idx
,
out_h
,
in_h
);
int
in_end_h
=
END_IND
(
out_h_idx
,
out_h
,
in_h
);
int
k_h
=
in_end_h
-
in_start_h
;
int
in_start_w
=
START_IND
(
out_w_idx
,
out_w
,
in_w
);
int
in_end_w
=
END_IND
(
out_w_idx
,
out_w
,
in_w
);
int
k_w
=
in_end_w
-
in_start_w
;
const
T
grad_delta
=
output
[
idx
]
/
k_d
/
k_h
/
k_w
;
T
*
input_ptr
=
input
+
bc_idx
*
in_panel_size
+
in_start_d
*
in_h
*
in_w
+
in_start_h
*
in_w
+
in_start_w
;
for
(
int
id
=
0
;
id
<
k_d
;
++
id
)
{
for
(
int
ih
=
0
;
ih
<
k_h
;
++
ih
)
{
for
(
int
iw
=
0
;
iw
<
k_w
;
++
iw
)
{
// TODO (Tianyu): Use 'atmoic::Add' when necessary
cuda
::
atomic
::
Add
(
input_ptr
+
ih
*
in_w
+
iw
,
grad_delta
);
}
}
input_ptr
+=
in_h
*
in_w
;
// next input depth
}
}
}
template
<
typename
T
>
void
AvgForwardCompute
(
KernelComputeContext
*
ctx
,
const
int32_t
&
dim
)
{
const
Tensor
*
in_tensor
=
ctx
->
Tensor4ArgNameAndIndex
(
"x"
,
0
);
Tensor
*
out_tensor
=
ctx
->
Tensor4ArgNameAndIndex
(
"y"
,
0
);
const
T
*
in_ptr
=
in_tensor
->
dptr
<
T
>
();
T
*
out_ptr
=
out_tensor
->
mut_dptr
<
T
>
();
const
Shape
&
x_shape
=
ctx
->
TensorDesc4ArgNameAndIndex
(
"x"
,
0
)
->
shape
();
const
Shape
&
y_shape
=
ctx
->
TensorDesc4ArgNameAndIndex
(
"y"
,
0
)
->
shape
();
// TODO (Tianyu): Support 'channels_last'
std
::
string
data_format
=
"channels_first"
;
const
Shape
&
in
=
GetShape5D
(
x_shape
,
data_format
,
dim
);
const
Shape
&
out
=
GetShape5D
(
y_shape
,
data_format
,
dim
);
const
int
out_elems
=
out_tensor
->
shape_view
().
elem_cnt
();
RUN_CUDA_KERNEL
((
AdaptiveAvgPoolCudaKernel
<
T
>
),
ctx
->
stream
(),
out_elems
,
in_ptr
,
out_ptr
,
out_elems
,
in
.
At
(
2
),
in
.
At
(
3
),
in
.
At
(
4
),
out
.
At
(
2
),
out
.
At
(
3
),
out
.
At
(
4
));
}
template
<
typename
T
>
void
AvgBackwardCompute
(
KernelComputeContext
*
ctx
,
const
int32_t
&
dim
)
{
const
Tensor
*
out_tensor
=
ctx
->
Tensor4ArgNameAndIndex
(
"dy"
,
0
);
Tensor
*
in_tensor
=
ctx
->
Tensor4ArgNameAndIndex
(
"dx"
,
0
);
const
T
*
out_ptr
=
out_tensor
->
dptr
<
T
>
();
T
*
in_ptr
=
in_tensor
->
mut_dptr
<
T
>
();
const
Shape
&
dx_shape
=
ctx
->
TensorDesc4ArgNameAndIndex
(
"dx"
,
0
)
->
shape
();
const
Shape
&
dy_shape
=
ctx
->
TensorDesc4ArgNameAndIndex
(
"dy"
,
0
)
->
shape
();
// TODO (Tianyu): Support 'channels_last'
std
::
string
data_format
=
"channels_first"
;
const
Shape
&
in
=
GetShape5D
(
dx_shape
,
data_format
,
dim
);
const
Shape
&
out
=
GetShape5D
(
dy_shape
,
data_format
,
dim
);
const
int
in_elems
=
in_tensor
->
shape_view
().
elem_cnt
();
const
int
out_elems
=
out_tensor
->
shape_view
().
elem_cnt
();
RUN_CUDA_KERNEL
((
InitPtr
<
T
>
),
ctx
->
stream
(),
in_elems
,
in_elems
,
in_ptr
);
RUN_CUDA_KERNEL
((
AdaptiveAvgPoolGradCudaKernel
<
T
>
),
ctx
->
stream
(),
out_elems
,
in_ptr
,
out_ptr
,
out_elems
,
in
.
At
(
2
),
in
.
At
(
3
),
in
.
At
(
4
),
out
.
At
(
2
),
out
.
At
(
3
),
out
.
At
(
4
));
}
template
<
DeviceType
device_type
,
typename
T
>
class
GpuAdaptiveAvgPool1dKernel
final
:
public
OpKernel
{
public:
GpuAdaptiveAvgPool1dKernel
()
=
default
;
~
GpuAdaptiveAvgPool1dKernel
()
=
default
;
private:
using
user_op
::
OpKernel
::
Compute
;
void
Compute
(
KernelComputeContext
*
ctx
)
const
override
{
AvgForwardCompute
<
T
>
(
ctx
,
1
);
}
bool
AlwaysComputeWhenAllOutputsEmpty
()
const
override
{
return
false
;
}
};
template
<
DeviceType
device_type
,
typename
T
>
class
GpuAdaptiveAvgPool2dKernel
final
:
public
OpKernel
{
public:
GpuAdaptiveAvgPool2dKernel
()
=
default
;
~
GpuAdaptiveAvgPool2dKernel
()
=
default
;
private:
using
user_op
::
OpKernel
::
Compute
;
void
Compute
(
KernelComputeContext
*
ctx
)
const
override
{
AvgForwardCompute
<
T
>
(
ctx
,
2
);
}
bool
AlwaysComputeWhenAllOutputsEmpty
()
const
override
{
return
false
;
}
};
template
<
DeviceType
device_type
,
typename
T
>
class
GpuAdaptiveAvgPool3dKernel
final
:
public
OpKernel
{
public:
GpuAdaptiveAvgPool3dKernel
()
=
default
;
~
GpuAdaptiveAvgPool3dKernel
()
=
default
;
private:
using
user_op
::
OpKernel
::
Compute
;
void
Compute
(
KernelComputeContext
*
ctx
)
const
override
{
AvgForwardCompute
<
T
>
(
ctx
,
3
);
}
bool
AlwaysComputeWhenAllOutputsEmpty
()
const
override
{
return
false
;
}
};
template
<
DeviceType
device_type
,
typename
T
>
class
GpuAdaptiveAvgPool1dGradKernel
final
:
public
OpKernel
{
public:
GpuAdaptiveAvgPool1dGradKernel
()
=
default
;
~
GpuAdaptiveAvgPool1dGradKernel
()
=
default
;
private:
using
user_op
::
OpKernel
::
Compute
;
void
Compute
(
KernelComputeContext
*
ctx
)
const
override
{
AvgBackwardCompute
<
T
>
(
ctx
,
1
);
}
bool
AlwaysComputeWhenAllOutputsEmpty
()
const
override
{
return
false
;
}
};
template
<
DeviceType
device_type
,
typename
T
>
class
GpuAdaptiveAvgPool2dGradKernel
final
:
public
OpKernel
{
public:
GpuAdaptiveAvgPool2dGradKernel
()
=
default
;
~
GpuAdaptiveAvgPool2dGradKernel
()
=
default
;
private:
using
user_op
::
OpKernel
::
Compute
;
void
Compute
(
KernelComputeContext
*
ctx
)
const
override
{
AvgBackwardCompute
<
T
>
(
ctx
,
2
);
}
bool
AlwaysComputeWhenAllOutputsEmpty
()
const
override
{
return
false
;
}
};
template
<
DeviceType
device_type
,
typename
T
>
class
GpuAdaptiveAvgPool3dGradKernel
final
:
public
OpKernel
{
public:
GpuAdaptiveAvgPool3dGradKernel
()
=
default
;
~
GpuAdaptiveAvgPool3dGradKernel
()
=
default
;
private:
using
user_op
::
OpKernel
::
Compute
;
void
Compute
(
KernelComputeContext
*
ctx
)
const
override
{
AvgBackwardCompute
<
T
>
(
ctx
,
3
);
}
bool
AlwaysComputeWhenAllOutputsEmpty
()
const
override
{
return
false
;
}
};
#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL(device, dtype) \
REGISTER_USER_KERNEL("adaptive_avg_pool1d") \
.SetCreateFn<GpuAdaptiveAvgPool1dKernel<device, dtype>>() \
.SetIsMatchedHob((HobDeviceType() == device) \
&& (HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("adaptive_avg_pool2d") \
.SetCreateFn<GpuAdaptiveAvgPool2dKernel<device, dtype>>() \
.SetIsMatchedHob((HobDeviceType() == device) \
&& (HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("adaptive_avg_pool3d") \
.SetCreateFn<GpuAdaptiveAvgPool3dKernel<device, dtype>>() \
.SetIsMatchedHob((HobDeviceType() == device) \
&& (HobDataType("y", 0) == GetDataType<dtype>::value));
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL
(
DeviceType
::
kCUDA
,
float
);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL
(
DeviceType
::
kCUDA
,
double
);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_KERNEL
(
DeviceType
::
kCUDA
,
int
);
#define REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL(device, dtype) \
REGISTER_USER_KERNEL("adaptive_avg_pool1d_grad") \
.SetCreateFn<GpuAdaptiveAvgPool1dGradKernel<device, dtype>>() \
.SetIsMatchedHob((HobDeviceType() == device) \
&& (HobDataType("dx", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("adaptive_avg_pool2d_grad") \
.SetCreateFn<GpuAdaptiveAvgPool2dGradKernel<device, dtype>>() \
.SetIsMatchedHob((HobDeviceType() == device) \
&& (HobDataType("dx", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("adaptive_avg_pool3d_grad") \
.SetCreateFn<GpuAdaptiveAvgPool3dGradKernel<device, dtype>>() \
.SetIsMatchedHob((HobDeviceType() == device) \
&& (HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL
(
DeviceType
::
kCUDA
,
float
);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL
(
DeviceType
::
kCUDA
,
double
);
REGISTER_CUDA_ADAPTIVE_AVGPOOL_BACKWARD_KERNEL
(
DeviceType
::
kCUDA
,
int
);
}
// namespace user_op
}
// namespace oneflow
\ No newline at end of file
oneflow/user/kernels/affine_grid_kernel.hip.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/device/cuda_util.h"
#include "affine_grid_kernel.h"
namespace
oneflow
{
namespace
{
// Maps grid index `index` in [0, num_steps) to a normalized coordinate in
// [-1, 1]. With align_corners the endpoints hit exactly -1 and 1; otherwise
// the whole range is scaled by (num_steps - 1) / num_steps, pulling the
// samples inward. num_steps <= 1 yields 0 (also avoids division by zero).
template<typename data_type, bool align_corners>
OF_DEVICE_FUNC data_type LinspaceGPU(int32_t index, int32_t num_steps) {
  if (num_steps <= 1) { return static_cast<data_type>(0.0); }
  const double corner_coord = -1.0 + 2.0 / (num_steps - 1) * index;
  if (align_corners) { return static_cast<data_type>(corner_coord); }
  return static_cast<data_type>(corner_coord * (num_steps - 1) / num_steps);
}
// Fills an H x W x 3 base grid with homogeneous (x, y, 1) coordinates;
// one thread per pixel.
template<typename data_type, bool align_corners>
__global__ void Generate2DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr,
                                            int32_t H, int32_t W) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int32_t h = index / W;
    const int32_t w = index % W;
    const int32_t pixel_length = 3;  // (x, y, 1)
    data_type* pixel_ptr = grid_ptr + (h * W + w) * pixel_length;
    pixel_ptr[0] = LinspaceGPU<data_type, align_corners>(w, W);
    pixel_ptr[1] = LinspaceGPU<data_type, align_corners>(h, H);
    pixel_ptr[2] = static_cast<data_type>(1.0);
  }
}
// Fills a D x H x W x 4 base grid with homogeneous (x, y, z, 1) coordinates.
// One thread handles an entire (d, h) row and iterates over w.
template<typename data_type, bool align_corners>
__global__ void Generate3DBaseGridGPUKernel(const int32_t nthreads, data_type* grid_ptr,
                                            int32_t D, int32_t H, int32_t W) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    const int32_t d = index / H;
    const int32_t h = index % H;
    const int32_t pixel_length = 4;  // (x, y, z, 1)
    data_type* row_ptr = grid_ptr + (d * H + h) * W * pixel_length;
    const data_type d_value = LinspaceGPU<data_type, align_corners>(d, D);
    const data_type h_value = LinspaceGPU<data_type, align_corners>(h, H);
    for (int32_t w = 0; w < W; ++w) {
      data_type* pixel_ptr = row_ptr + w * pixel_length;
      pixel_ptr[0] = LinspaceGPU<data_type, align_corners>(w, W);
      pixel_ptr[1] = h_value;
      pixel_ptr[2] = d_value;
      pixel_ptr[3] = static_cast<data_type>(1.0);
    }
  }
}
}
// namespace
// Launches the float 2-D base-grid kernel; one thread per output pixel.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate2D(user_op::KernelComputeContext* ctx,
                                                        float* grid_ptr, int64_t H, int64_t W,
                                                        bool align_corners) {
  const int count = H * W;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<float, true>), ctx->stream(), count, count,
                    grid_ptr, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<float, false>), ctx->stream(), count, count,
                    grid_ptr, H, W);
  }
}
// Launches the double 2-D base-grid kernel; one thread per output pixel.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate2D(user_op::KernelComputeContext* ctx,
                                                        double* grid_ptr, int64_t H, int64_t W,
                                                        bool align_corners) {
  const int count = H * W;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<double, true>), ctx->stream(), count, count,
                    grid_ptr, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<double, false>), ctx->stream(), count, count,
                    grid_ptr, H, W);
  }
}
// Launches the float 3-D base-grid kernel; one thread per (d, h) row.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate3D(user_op::KernelComputeContext* ctx,
                                                        float* grid_ptr, int64_t D, int64_t H,
                                                        int64_t W, bool align_corners) {
  const int count = D * H;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<float, true>), ctx->stream(), count, count,
                    grid_ptr, D, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<float, false>), ctx->stream(), count, count,
                    grid_ptr, D, H, W);
  }
}
// Launches the double 3-D base-grid kernel; one thread per (d, h) row.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate3D(user_op::KernelComputeContext* ctx,
                                                        double* grid_ptr, int64_t D, int64_t H,
                                                        int64_t W, bool align_corners) {
  const int count = D * H;
  if (align_corners) {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<double, true>), ctx->stream(), count, count,
                    grid_ptr, D, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<double, false>), ctx->stream(), count, count,
                    grid_ptr, D, H, W);
  }
}
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/device/cuda_util.h"
#include "affine_grid_kernel.h"
namespace
oneflow
{
namespace
{
template
<
typename
data_type
,
bool
align_corners
>
OF_DEVICE_FUNC
data_type
LinspaceGPU
(
int32_t
index
,
int32_t
num_steps
)
{
if
(
num_steps
<=
1
)
{
return
static_cast
<
data_type
>
(
0.0
);
}
if
(
align_corners
)
{
return
static_cast
<
data_type
>
(
-
1.0
+
2.0
/
(
num_steps
-
1
)
*
index
);
}
else
{
return
static_cast
<
data_type
>
((
-
1.0
+
2.0
/
(
num_steps
-
1
)
*
index
)
*
(
num_steps
-
1
)
/
num_steps
);
}
}
template
<
typename
data_type
,
bool
align_corners
>
__global__
void
Generate2DBaseGridGPUKernel
(
const
int32_t
nthreads
,
data_type
*
grid_ptr
,
int32_t
H
,
int32_t
W
)
{
CUDA_1D_KERNEL_LOOP
(
index
,
nthreads
)
{
const
int32_t
h
=
index
/
W
;
const
int32_t
w
=
index
%
W
;
const
int32_t
pixel_length
=
3
;
data_type
*
row_ptr
=
grid_ptr
+
h
*
W
*
pixel_length
;
data_type
*
pixel_ptr
=
row_ptr
+
w
*
pixel_length
;
data_type
h_value
=
LinspaceGPU
<
data_type
,
align_corners
>
(
h
,
H
);
data_type
w_value
=
LinspaceGPU
<
data_type
,
align_corners
>
(
w
,
W
);
pixel_ptr
[
0
]
=
w_value
;
pixel_ptr
[
1
]
=
h_value
;
pixel_ptr
[
2
]
=
static_cast
<
data_type
>
(
1.0
);
}
}
template
<
typename
data_type
,
bool
align_corners
>
__global__
void
Generate3DBaseGridGPUKernel
(
const
int32_t
nthreads
,
data_type
*
grid_ptr
,
int32_t
D
,
int32_t
H
,
int32_t
W
)
{
CUDA_1D_KERNEL_LOOP
(
index
,
nthreads
)
{
const
int32_t
d
=
index
/
H
;
const
int32_t
h
=
index
%
H
;
const
int32_t
pixel_length
=
4
;
data_type
*
image_ptr
=
grid_ptr
+
d
*
H
*
W
*
pixel_length
;
data_type
*
row_ptr
=
image_ptr
+
h
*
W
*
pixel_length
;
data_type
d_value
=
LinspaceGPU
<
data_type
,
align_corners
>
(
d
,
D
);
data_type
h_value
=
LinspaceGPU
<
data_type
,
align_corners
>
(
h
,
H
);
for
(
int32_t
w
=
0
;
w
<
W
;
++
w
)
{
data_type
*
pixel_ptr
=
row_ptr
+
w
*
pixel_length
;
data_type
w_value
=
LinspaceGPU
<
data_type
,
align_corners
>
(
w
,
W
);
pixel_ptr
[
0
]
=
w_value
;
pixel_ptr
[
1
]
=
h_value
;
pixel_ptr
[
2
]
=
d_value
;
pixel_ptr
[
3
]
=
static_cast
<
data_type
>
(
1.0
);
}
}
}
}
// namespace
// Launches the float 2D base-grid kernel on the device stream; one thread per
// (h, w) pixel. The align_corners flag selects the kernel template instantiation.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate2D(user_op::KernelComputeContext* ctx,
                                                        float* grid_ptr, int64_t H, int64_t W,
                                                        bool align_corners) {
  // NOTE(review): H * W is narrowed to 32-bit here; assumes grids never exceed
  // INT_MAX elements — confirm upstream shape validation.
  const int elem_cnt = H * W;
  if (!align_corners) {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<float, false>), ctx->stream(), elem_cnt, elem_cnt,
                    grid_ptr, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<float, true>), ctx->stream(), elem_cnt, elem_cnt,
                    grid_ptr, H, W);
  }
}
// Double-precision overload of Generate2D; mirrors the float version.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate2D(user_op::KernelComputeContext* ctx,
                                                        double* grid_ptr, int64_t H, int64_t W,
                                                        bool align_corners) {
  // NOTE(review): H * W is narrowed to 32-bit here; assumes grids never exceed
  // INT_MAX elements — confirm upstream shape validation.
  const int elem_cnt = H * W;
  if (!align_corners) {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<double, false>), ctx->stream(), elem_cnt,
                    elem_cnt, grid_ptr, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate2DBaseGridGPUKernel<double, true>), ctx->stream(), elem_cnt, elem_cnt,
                    grid_ptr, H, W);
  }
}
// Launches the float 3D base-grid kernel. The kernel iterates over W itself,
// so the thread count is D * H, not D * H * W.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate3D(user_op::KernelComputeContext* ctx,
                                                        float* grid_ptr, int64_t D, int64_t H,
                                                        int64_t W, bool align_corners) {
  // NOTE(review): D * H is narrowed to 32-bit; assumed to fit INT_MAX.
  const int elem_cnt = D * H;
  if (!align_corners) {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<float, false>), ctx->stream(), elem_cnt, elem_cnt,
                    grid_ptr, D, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<float, true>), ctx->stream(), elem_cnt, elem_cnt,
                    grid_ptr, D, H, W);
  }
}
// Double-precision overload of Generate3D; mirrors the float version.
void GenerateBaseGridImp<DeviceType::kCUDA>::Generate3D(user_op::KernelComputeContext* ctx,
                                                        double* grid_ptr, int64_t D, int64_t H,
                                                        int64_t W, bool align_corners) {
  // NOTE(review): D * H is narrowed to 32-bit; assumed to fit INT_MAX.
  const int elem_cnt = D * H;
  if (!align_corners) {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<double, false>), ctx->stream(), elem_cnt,
                    elem_cnt, grid_ptr, D, H, W);
  } else {
    RUN_CUDA_KERNEL((Generate3DBaseGridGPUKernel<double, true>), ctx->stream(), elem_cnt, elem_cnt,
                    grid_ptr, D, H, W);
  }
}
}
// namespace oneflow
\ No newline at end of file
oneflow/user/kernels/arange_kernel_util.hip.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_ROCM
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/user/kernels/arange_kernel_util.h"
namespace
oneflow
{
namespace
user_op
{
// GPU entry point for the arange op. Delegates the per-element work to the
// shared helper DoArange (declared in arange_kernel_util.h), which presumably
// writes out[i] = start + i * delta — confirm against the helper's definition.
template<typename T>
__global__ void ArangeForwardGpuKernel(const T start, const T delta, const int64_t arange_elem_cnt,
                                       T* out) {
  // Use Loop to set the value
  DoArange<T>(start, delta, arange_elem_cnt, out);
}
// Device-side (kCUDA/ROCm) specialization of ArangeFunctor: launches the
// arange kernel on the given execution stream.
template<typename T>
struct ArangeFunctor<DeviceType::kCUDA, T> final {
  // `arange_elem_cnt` is both the launch size and a kernel argument.
  void operator()(ep::Stream* stream, const T start, const T delta, const int64_t arange_elem_cnt,
                  T* out) {
    // The thread num is set as arange_elem_cnt
    RUN_CUDA_KERNEL((ArangeForwardGpuKernel<T>), stream, arange_elem_cnt, start, delta,
                    arange_elem_cnt, out);
  }
};
// Explicitly instantiate ArangeFunctor<kCUDA, T> for every T in ARANGE_DATA_TYPE_SEQ.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ARANGE_FUNCTOR, (DeviceType::kCUDA),
                                 ARANGE_DATA_TYPE_SEQ);
}
// namespace user_op
}
// namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef WITH_ROCM
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/user/kernels/arange_kernel_util.h"
namespace
oneflow
{
namespace
user_op
{
// GPU entry point for the arange op. Delegates the per-element work to the
// shared helper DoArange (declared in arange_kernel_util.h), which presumably
// writes out[i] = start + i * delta — confirm against the helper's definition.
template<typename T>
__global__ void ArangeForwardGpuKernel(const T start, const T delta, const int64_t arange_elem_cnt,
                                       T* out) {
  // Use Loop to set the value
  DoArange<T>(start, delta, arange_elem_cnt, out);
}
// Device-side (kCUDA/ROCm) specialization of ArangeFunctor: launches the
// arange kernel on the given execution stream.
template<typename T>
struct ArangeFunctor<DeviceType::kCUDA, T> final {
  // `arange_elem_cnt` is both the launch size and a kernel argument.
  void operator()(ep::Stream* stream, const T start, const T delta, const int64_t arange_elem_cnt,
                  T* out) {
    // The thread num is set as arange_elem_cnt
    RUN_CUDA_KERNEL((ArangeForwardGpuKernel<T>), stream, arange_elem_cnt, start, delta,
                    arange_elem_cnt, out);
  }
};
// Explicitly instantiate ArangeFunctor<kCUDA, T> for every T in ARANGE_DATA_TYPE_SEQ.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_ARANGE_FUNCTOR, (DeviceType::kCUDA),
                                 ARANGE_DATA_TYPE_SEQ);
}
// namespace user_op
}
// namespace oneflow
#endif // End WITH_ROCM
\ No newline at end of file
oneflow/user/kernels/arg_sort_kernel.hip.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/user/kernels/radix_sort.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
{
// Carves the op's single raw tmp_buffer into three consecutive regions:
//   [sorted values of type T][int32 index payload][radix-sort temp storage]
// Each of the first two regions is padded to the CUDA alignment; whatever
// capacity remains becomes the temp storage, and we CHECK it is non-negative.
template<typename T>
class TmpBufferManager final {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager);
  // `capacity` is the byte size of `ptr`; element counts come from `in_shape`.
  TmpBufferManager(int32_t capacity, void* ptr, const ShapeView& in_shape)
      : capacity_{capacity},
        sorted_in_elem_cnt_{in_shape.elem_cnt()},
        indices_elem_cnt_{sorted_in_elem_cnt_} {
    const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T));
    const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t));
    sorted_in_ptr_ = reinterpret_cast<T*>(ptr);
    indices_ptr_ = reinterpret_cast<int32_t*>(reinterpret_cast<char*>(sorted_in_ptr_)
                                              + sorted_in_aligned_bytes);
    temp_storage_ptr_ = reinterpret_cast<void*>(reinterpret_cast<char*>(indices_ptr_)
                                                + indices_aligned_bytes);
    // Remaining bytes are handed to the sort as scratch space.
    temp_storage_bytes_ = capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes;
    CHECK_GE(temp_storage_bytes_, 0);
  }
  ~TmpBufferManager() = default;

  T* SortedInPtr() const { return sorted_in_ptr_; }
  int32_t* IndicesPtr() const { return indices_ptr_; }
  void* TempStoragePtr() const { return temp_storage_ptr_; }
  int32_t TempStorageBytes() const { return temp_storage_bytes_; }

 private:
  int32_t capacity_;        // total bytes available in the raw buffer
  T* sorted_in_ptr_;        // region 1: sorted copy of the input values
  int32_t* indices_ptr_;    // region 2: index payload sorted alongside values
  void* temp_storage_ptr_;  // region 3: scratch for the radix sort
  int64_t sorted_in_elem_cnt_;
  int64_t indices_elem_cnt_;
  int32_t temp_storage_bytes_;
};
// Seeds the index payload with a per-row iota: indices_ptr[i] = i mod
// instance_size, i.e. 0..instance_size-1 repeated for every instance.
__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr, int32_t instance_size) {
  CUDA_1D_KERNEL_LOOP(idx, elem_cnt) { indices_ptr[idx] = idx % instance_size; }
}
}
// namespace
// GPU kernel for the "arg_sort" op: for each innermost row of `in`, writes to
// `out` the permutation of indices that sorts that row in the requested
// direction. The raw tmp_buffer is partitioned by TmpBufferManager into the
// sorted values, the index payload, and the sort's temp storage.
template<typename T>
class GpuArgSortKernel final : public user_op::OpKernel {
 public:
  GpuArgSortKernel() = default;
  ~GpuArgSortKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    TmpBufferManager<T> buf_manager(static_cast<int32_t>(tmp_buffer->shape_view().elem_cnt()),
                                    tmp_buffer->mut_dptr<void>(), in->shape_view());
    // One "instance" is one innermost row; the sort is segmented per instance.
    const int32_t elem_cnt = in->shape_view().elem_cnt();
    const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1);
    const int32_t instance_num = elem_cnt / instance_size;
    const std::string& direction = ctx->Attr<std::string>("direction");
    // Seed the payload with 0..instance_size-1 per row; the pair sort below
    // permutes these indices alongside the values.
    InitializeIndices<<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                        ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, buf_manager.IndicesPtr(), instance_size);
    if (direction == "ASCENDING") {
      SortPairsAscending(in->dptr<T>(), buf_manager.IndicesPtr(), instance_num, instance_size,
                         buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(),
                         buf_manager.SortedInPtr(), out->mut_dptr<int32_t>(),
                         ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    } else if (direction == "DESCENDING") {
      SortPairsDescending(in->dptr<T>(), buf_manager.IndicesPtr(), instance_num, instance_size,
                          buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(),
                          buf_manager.SortedInPtr(), out->mut_dptr<int32_t>(),
                          ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    } else {
      // Any other attr value is a programming error upstream.
      UNIMPLEMENTED();
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
/* Registers GpuArgSortKernel<dtype> for the "arg_sort" op on kCUDA devices.
 * The tmp-size fn mirrors TmpBufferManager's layout exactly: aligned sorted
 * values + aligned int32 indices + the sort's inferred temp storage. */
#define REGISTER_CUDA_ARG_SORT_KERNEL(dtype)                                                      \
  REGISTER_USER_KERNEL("arg_sort")                                                                \
      .SetCreateFn<GpuArgSortKernel<dtype>>()                                                     \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                            \
                       && (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))           \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                         \
        const Shape& in_shape = ctx->InputShape("in", 0);                                         \
        const int32_t elem_cnt = in_shape.elem_cnt();                                             \
        const int32_t instance_size = in_shape.dim_vec().back();                                  \
        const int32_t instance_num = elem_cnt / instance_size;                                    \
                                                                                                  \
        /* Sorted In */                                                                           \
        const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype));     \
        /* Indices */                                                                             \
        const int32_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int32_t));     \
        /* CUB Temp Storage */                                                                    \
        int32_t temp_storage_bytes = -1;                                                          \
        const std::string& direction = ctx->Attr<std::string>("direction");                       \
        if (direction == "ASCENDING") {                                                           \
          temp_storage_bytes =                                                                    \
              InferTempStorageForSortPairsAscending<dtype, int32_t>(instance_num, instance_size); \
        } else if (direction == "DESCENDING") {                                                   \
          temp_storage_bytes =                                                                    \
              InferTempStorageForSortPairsDescending<dtype, int32_t>(instance_num,                \
                                                                     instance_size);             \
        } else {                                                                                  \
          UNIMPLEMENTED();                                                                        \
        }                                                                                         \
                                                                                                  \
        return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes;              \
      });
// Register the arg_sort GPU kernel for every supported element type.
REGISTER_CUDA_ARG_SORT_KERNEL(float)
REGISTER_CUDA_ARG_SORT_KERNEL(double)
REGISTER_CUDA_ARG_SORT_KERNEL(bool)
REGISTER_CUDA_ARG_SORT_KERNEL(int8_t)
REGISTER_CUDA_ARG_SORT_KERNEL(uint8_t)
REGISTER_CUDA_ARG_SORT_KERNEL(int32_t)
REGISTER_CUDA_ARG_SORT_KERNEL(int64_t)
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/user/kernels/radix_sort.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
{
// Carves the op's single raw tmp_buffer into three consecutive regions:
//   [sorted values of type T][int32 index payload][radix-sort temp storage]
// Each of the first two regions is padded to the CUDA alignment; whatever
// capacity remains becomes the temp storage, and we CHECK it is non-negative.
template<typename T>
class TmpBufferManager final {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager);
  // `capacity` is the byte size of `ptr`; element counts come from `in_shape`.
  TmpBufferManager(int32_t capacity, void* ptr, const ShapeView& in_shape)
      : capacity_{capacity},
        sorted_in_elem_cnt_{in_shape.elem_cnt()},
        indices_elem_cnt_{sorted_in_elem_cnt_} {
    const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(sorted_in_elem_cnt_ * sizeof(T));
    const int32_t indices_aligned_bytes = GetCudaAlignedSize(indices_elem_cnt_ * sizeof(int32_t));
    sorted_in_ptr_ = reinterpret_cast<T*>(ptr);
    indices_ptr_ = reinterpret_cast<int32_t*>(reinterpret_cast<char*>(sorted_in_ptr_)
                                              + sorted_in_aligned_bytes);
    temp_storage_ptr_ = reinterpret_cast<void*>(reinterpret_cast<char*>(indices_ptr_)
                                                + indices_aligned_bytes);
    // Remaining bytes are handed to the sort as scratch space.
    temp_storage_bytes_ = capacity_ - sorted_in_aligned_bytes - indices_aligned_bytes;
    CHECK_GE(temp_storage_bytes_, 0);
  }
  ~TmpBufferManager() = default;

  T* SortedInPtr() const { return sorted_in_ptr_; }
  int32_t* IndicesPtr() const { return indices_ptr_; }
  void* TempStoragePtr() const { return temp_storage_ptr_; }
  int32_t TempStorageBytes() const { return temp_storage_bytes_; }

 private:
  int32_t capacity_;        // total bytes available in the raw buffer
  T* sorted_in_ptr_;        // region 1: sorted copy of the input values
  int32_t* indices_ptr_;    // region 2: index payload sorted alongside values
  void* temp_storage_ptr_;  // region 3: scratch for the radix sort
  int64_t sorted_in_elem_cnt_;
  int64_t indices_elem_cnt_;
  int32_t temp_storage_bytes_;
};
// Seeds the index payload with a per-row iota: indices_ptr[i] = i mod
// instance_size, i.e. 0..instance_size-1 repeated for every instance.
__global__ void InitializeIndices(int32_t elem_cnt, int32_t* indices_ptr, int32_t instance_size) {
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) { indices_ptr[i] = i % instance_size; };
}
}
// namespace
// GPU kernel for the "arg_sort" op: for each innermost row of `in`, writes to
// `out` the permutation of indices that sorts that row in the requested
// direction. The raw tmp_buffer is partitioned by TmpBufferManager into the
// sorted values, the index payload, and the sort's temp storage.
template<typename T>
class GpuArgSortKernel final : public user_op::OpKernel {
 public:
  GpuArgSortKernel() = default;
  ~GpuArgSortKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    TmpBufferManager<T> buf_manager(static_cast<int32_t>(tmp_buffer->shape_view().elem_cnt()),
                                    tmp_buffer->mut_dptr<void>(), in->shape_view());
    // One "instance" is one innermost row; the sort is segmented per instance.
    const int32_t elem_cnt = in->shape_view().elem_cnt();
    const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1);
    const int32_t instance_num = elem_cnt / instance_size;
    const std::string& direction = ctx->Attr<std::string>("direction");
    // Seed the payload with 0..instance_size-1 per row; the pair sort below
    // permutes these indices alongside the values.
    InitializeIndices<<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                        ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, buf_manager.IndicesPtr(), instance_size);
    if (direction == "ASCENDING") {
      SortPairsAscending(in->dptr<T>(), buf_manager.IndicesPtr(), instance_num, instance_size,
                         buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(),
                         buf_manager.SortedInPtr(), out->mut_dptr<int32_t>(),
                         ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    } else if (direction == "DESCENDING") {
      SortPairsDescending(in->dptr<T>(), buf_manager.IndicesPtr(), instance_num, instance_size,
                          buf_manager.TempStoragePtr(), buf_manager.TempStorageBytes(),
                          buf_manager.SortedInPtr(), out->mut_dptr<int32_t>(),
                          ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    } else {
      // Any other attr value is a programming error upstream.
      UNIMPLEMENTED();
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
/* Registers GpuArgSortKernel<dtype> for the "arg_sort" op on kCUDA devices.
 * The tmp-size fn mirrors TmpBufferManager's layout exactly: aligned sorted
 * values + aligned int32 indices + the sort's inferred temp storage. */
#define REGISTER_CUDA_ARG_SORT_KERNEL(dtype)                                                      \
  REGISTER_USER_KERNEL("arg_sort")                                                                \
      .SetCreateFn<GpuArgSortKernel<dtype>>()                                                     \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                            \
                       && (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))           \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                         \
        const Shape& in_shape = ctx->InputShape("in", 0);                                         \
        const int32_t elem_cnt = in_shape.elem_cnt();                                             \
        const int32_t instance_size = in_shape.dim_vec().back();                                  \
        const int32_t instance_num = elem_cnt / instance_size;                                    \
                                                                                                  \
        /* Sorted In */                                                                           \
        const int32_t sorted_in_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(dtype));     \
        /* Indices */                                                                             \
        const int32_t indices_aligned_bytes = GetCudaAlignedSize(elem_cnt * sizeof(int32_t));     \
        /* CUB Temp Storage */                                                                    \
        int32_t temp_storage_bytes = -1;                                                          \
        const std::string& direction = ctx->Attr<std::string>("direction");                       \
        if (direction == "ASCENDING") {                                                           \
          temp_storage_bytes =                                                                    \
              InferTempStorageForSortPairsAscending<dtype, int32_t>(instance_num, instance_size); \
        } else if (direction == "DESCENDING") {                                                   \
          temp_storage_bytes =                                                                    \
              InferTempStorageForSortPairsDescending<dtype, int32_t>(instance_num,                \
                                                                     instance_size);             \
        } else {                                                                                  \
          UNIMPLEMENTED();                                                                        \
        }                                                                                         \
                                                                                                  \
        return sorted_in_aligned_bytes + indices_aligned_bytes + temp_storage_bytes;              \
      });
// Register the arg_sort GPU kernel for every supported element type.
REGISTER_CUDA_ARG_SORT_KERNEL(float)
REGISTER_CUDA_ARG_SORT_KERNEL(double)
REGISTER_CUDA_ARG_SORT_KERNEL(bool)
REGISTER_CUDA_ARG_SORT_KERNEL(int8_t)
REGISTER_CUDA_ARG_SORT_KERNEL(uint8_t)
REGISTER_CUDA_ARG_SORT_KERNEL(int32_t)
REGISTER_CUDA_ARG_SORT_KERNEL(int64_t)
}
// namespace oneflow
\ No newline at end of file
oneflow/user/kernels/arg_where_kernel_util.hip.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/user/kernels/arg_where_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/core/common/small_vector.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hipcub/hipcub.hpp>
namespace
oneflow
{
namespace
{
// Launch configuration shared by the kernels below; kBlockSize comes from the
// elementwise helper so the __launch_bounds__ annotation stays consistent.
constexpr int kBlockSize = cuda::elementwise::kBlockSize;

// Returns the grid size the elementwise helper recommends for `elem_cnt`
// items; CHECK-fails on an error from the helper.
int GetNumBlocks(int64_t elem_cnt) {
  int num_blocks = 0;
  OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks));
  return num_blocks;
}
// Random-access output "iterator" whose operator[] addresses every NDIM-th
// element of the underlying buffer. Passed to hipcub::DeviceSelect so that the
// i-th selected offset lands at output[i * NDIM] — the first slot of the i-th
// ND-index row, which CudaOffsetToNdIndexInplace later expands in place.
// NOTE(review): only operator[] is implemented; hipcub's Flagged appears to
// need no more than that here — confirm if the hipcub version changes.
template<typename T, int NDIM>
struct StrideIterator {
  typedef StrideIterator self_type;
  typedef std::ptrdiff_t difference_type;
  typedef T value_type;
  typedef T* pointer;
  typedef T& reference;
  typedef std::random_access_iterator_tag iterator_category;

  // `max_iters` bounds the number of addressable rows (device-side assert only).
  explicit StrideIterator(T* ptr, size_t max_iters) : ptr_(ptr), max_iters_(max_iters) {}

  OF_DEVICE_FUNC reference operator[](int i) {
    assert(0 <= i && i < max_iters_);
    return *(ptr_ + (i * NDIM));
  }

 private:
  T* ptr_;
  size_t max_iters_;
};
// Expands each flat offset stored at output_ptr[i * NDIM] into its full
// NDIM-dimensional index, written over the same row in place. The element
// count is read from device memory (*output_size_ptr), which is the count
// produced by the preceding DeviceSelect pass.
template<typename T, int NDIM>
__global__ void __launch_bounds__(kBlockSize)
    CudaOffsetToNdIndexInplace(NdIndexOffsetHelper<T, NDIM> index_converter,
                               const T* output_size_ptr, T* output_ptr) {
  CUDA_1D_KERNEL_LOOP_T(T, i, *output_size_ptr) {
    T* index_ptr = output_ptr + i * NDIM;
    // Reads the flat offset from slot 0 and writes NDIM index components back.
    index_converter.OffsetToNdIndex(*index_ptr, index_ptr);
  }
}
// Device predicate: treats any non-zero value as "selected".
template<typename T>
struct IsTrue {
  __device__ __forceinline__ bool operator()(const T& val) const {
    return static_cast<bool>(val);
  }
};
template
<
typename
IN_T
,
typename
OUT_T
,
typename
OUT_ITER
>
hipError_t
SelectTrue
(
hipStream_t
stream
,
int
num_items
,
void
*
temp_storage
,
size_t
&
temp_storage_bytes
,
const
IN_T
*
input
,
OUT_ITER
output_iter
,
OUT_T
*
num_selected
)
{
IsTrue
<
IN_T
>
is_true
;
hipcub
::
TransformInputIterator
<
bool
,
IsTrue
<
IN_T
>
,
const
IN_T
*>
flag_iter
(
input
,
is_true
);
hipcub
::
CountingInputIterator
<
OUT_T
>
offset_counter
(
0
);
return
hipcub
::
DeviceSelect
::
Flagged
(
temp_storage
,
temp_storage_bytes
,
offset_counter
,
flag_iter
,
output_iter
,
num_selected
,
num_items
,
stream
,
false
);
}
}
// namespace
// kCUDA (ROCm) implementation of arg_where: writes the ND indices of all
// non-zero input elements to output_ptr (row-major, NDIM entries per hit) and
// the hit count to output_size_ptr (device memory).
template<typename IN_T, typename OUT_T, int NDIM>
struct ArgWhereKernelUtil<DeviceType::kCUDA, IN_T, OUT_T, NDIM> {
  static void ArgWhere(ep::Stream* stream, const ShapeView& input_shape, const IN_T* input_ptr,
                       void* temp_storage, size_t temp_storage_bytes, OUT_T* output_ptr,
                       OUT_T* output_size_ptr) {
    const int64_t elem_cnt = input_shape.elem_cnt();
    // deal with empty blob
    if (elem_cnt == 0) {
      Memset<DeviceType::kCUDA>(stream, output_size_ptr, 0, sizeof(OUT_T));
      return;
    }
    CHECK_NOTNULL(stream);
    // Offsets must be representable in OUT_T.
    CHECK_LE(elem_cnt, std::numeric_limits<OUT_T>::max());
    size_t workspace = GetWorkspaceBytesSize(stream, elem_cnt);
    CHECK_LE(workspace, temp_storage_bytes);
    if (NDIM == 1) {
      // 1-D: the flat offset already is the index; write it directly.
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OUT_T*>(
          stream->As<ep::CudaStream>()->cuda_stream(), input_shape.elem_cnt(), temp_storage,
          workspace, input_ptr, output_ptr, output_size_ptr)));
    } else {
      // N-D: select flat offsets into every NDIM-th slot, then expand each
      // offset to a full ND index in place.
      using OutputIterator = StrideIterator<OUT_T, NDIM>;
      OutputIterator output_iter(output_ptr, elem_cnt);
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OutputIterator>(
          stream->As<ep::CudaStream>()->cuda_stream(), elem_cnt, temp_storage, workspace,
          input_ptr, output_iter, output_size_ptr)));
      OUT_T dims[NDIM] = {0};
      std::transform(input_shape.ptr(), input_shape.ptr() + input_shape.NumAxes(), dims,
                     [](int64_t dim) { return static_cast<OUT_T>(dim); });
      NdIndexOffsetHelper<OUT_T, NDIM> index_converter(dims);
      CudaOffsetToNdIndexInplace<OUT_T, NDIM>
          <<<GetNumBlocks(elem_cnt), kBlockSize, 0,
             stream->As<ep::CudaStream>()->cuda_stream()>>>(index_converter, output_size_ptr,
                                                            output_ptr);
    }
  }

  // Size of the CUB scratch buffer needed by ArgWhere for `elem_cnt` items,
  // obtained via the CUB dry-run protocol (null temp storage pointer).
  static size_t GetWorkspaceBytesSize(ep::Stream* stream, int64_t elem_cnt) {
    hipStream_t cuda_stream = stream ? stream->As<ep::CudaStream>()->cuda_stream() : 0;
    size_t workspace = 0;
    if (NDIM == 1) {
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OUT_T*>(cuda_stream, elem_cnt, nullptr, workspace,
                                                     nullptr, nullptr, nullptr)));
    } else {
      using OutputIterator = StrideIterator<OUT_T, NDIM>;
      OutputIterator output_iter(nullptr, elem_cnt);
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OutputIterator>(
          cuda_stream, elem_cnt, nullptr, workspace, nullptr, output_iter, nullptr)));
    }
    return workspace;
  }
};
// Instantiate ArgWhereKernelUtil for kCUDA over the supported type/NDIM combos.
INSTANTIATE_ARG_WHERE_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA)
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/user/kernels/arg_where_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/core/common/small_vector.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <hipcub/hipcub.hpp>
namespace
oneflow
{
namespace
{
// Launch configuration shared by the kernels below; kBlockSize comes from the
// elementwise helper so the __launch_bounds__ annotation stays consistent.
constexpr int kBlockSize = cuda::elementwise::kBlockSize;

// Returns the grid size the elementwise helper recommends for `elem_cnt`
// items; CHECK-fails on an error from the helper.
int GetNumBlocks(int64_t elem_cnt) {
  int num_blocks = 0;
  OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks));
  return num_blocks;
}
// Random-access output "iterator" whose operator[] addresses every NDIM-th
// element of the underlying buffer. Passed to hipcub::DeviceSelect so that the
// i-th selected offset lands at output[i * NDIM] — the first slot of the i-th
// ND-index row, which CudaOffsetToNdIndexInplace later expands in place.
// NOTE(review): only operator[] is implemented; hipcub's Flagged appears to
// need no more than that here — confirm if the hipcub version changes.
template<typename T, int NDIM>
struct StrideIterator {
  typedef StrideIterator self_type;
  typedef std::ptrdiff_t difference_type;
  typedef T value_type;
  typedef T* pointer;
  typedef T& reference;
  typedef std::random_access_iterator_tag iterator_category;

  // `max_iters` bounds the number of addressable rows (device-side assert only).
  explicit StrideIterator(T* ptr, size_t max_iters) : ptr_(ptr), max_iters_(max_iters) {}

  OF_DEVICE_FUNC reference operator[](int i) {
    assert(0 <= i && i < max_iters_);
    return *(ptr_ + (i * NDIM));
  }

 private:
  T* ptr_;
  size_t max_iters_;
};
// Expands each flat offset stored at output_ptr[i * NDIM] into its full
// NDIM-dimensional index, written over the same row in place. The element
// count is read from device memory (*output_size_ptr), which is the count
// produced by the preceding DeviceSelect pass.
template<typename T, int NDIM>
__global__ void __launch_bounds__(kBlockSize)
    CudaOffsetToNdIndexInplace(NdIndexOffsetHelper<T, NDIM> index_converter,
                               const T* output_size_ptr, T* output_ptr) {
  CUDA_1D_KERNEL_LOOP_T(T, i, *output_size_ptr) {
    T* index_ptr = output_ptr + i * NDIM;
    // Reads the flat offset from slot 0 and writes NDIM index components back.
    index_converter.OffsetToNdIndex(*index_ptr, index_ptr);
  }
}
// Device predicate: treats any non-zero value as "selected".
template<typename T>
struct IsTrue {
  __device__ __forceinline__ bool operator()(const T& val) const {
    return static_cast<bool>(val);
  }
};
template
<
typename
IN_T
,
typename
OUT_T
,
typename
OUT_ITER
>
hipError_t
SelectTrue
(
hipStream_t
stream
,
int
num_items
,
void
*
temp_storage
,
size_t
&
temp_storage_bytes
,
const
IN_T
*
input
,
OUT_ITER
output_iter
,
OUT_T
*
num_selected
)
{
IsTrue
<
IN_T
>
is_true
;
hipcub
::
TransformInputIterator
<
bool
,
IsTrue
<
IN_T
>
,
const
IN_T
*>
flag_iter
(
input
,
is_true
);
hipcub
::
CountingInputIterator
<
OUT_T
>
offset_counter
(
0
);
return
hipcub
::
DeviceSelect
::
Flagged
(
temp_storage
,
temp_storage_bytes
,
offset_counter
,
flag_iter
,
output_iter
,
num_selected
,
num_items
,
stream
,
false
);
}
}
// namespace
// kCUDA (ROCm) implementation of arg_where: writes the ND indices of all
// non-zero input elements to output_ptr (row-major, NDIM entries per hit) and
// the hit count to output_size_ptr (device memory).
template<typename IN_T, typename OUT_T, int NDIM>
struct ArgWhereKernelUtil<DeviceType::kCUDA, IN_T, OUT_T, NDIM> {
  static void ArgWhere(ep::Stream* stream, const ShapeView& input_shape, const IN_T* input_ptr,
                       void* temp_storage, size_t temp_storage_bytes, OUT_T* output_ptr,
                       OUT_T* output_size_ptr) {
    const int64_t elem_cnt = input_shape.elem_cnt();
    // deal with empty blob
    if (elem_cnt == 0) {
      Memset<DeviceType::kCUDA>(stream, output_size_ptr, 0, sizeof(OUT_T));
      return;
    }
    CHECK_NOTNULL(stream);
    // Offsets must be representable in OUT_T.
    CHECK_LE(elem_cnt, std::numeric_limits<OUT_T>::max());
    size_t workspace = GetWorkspaceBytesSize(stream, elem_cnt);
    CHECK_LE(workspace, temp_storage_bytes);
    if (NDIM == 1) {
      // 1-D: the flat offset already is the index; write it directly.
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OUT_T*>(
          stream->As<ep::CudaStream>()->cuda_stream(), input_shape.elem_cnt(), temp_storage,
          workspace, input_ptr, output_ptr, output_size_ptr)));
    } else {
      // N-D: select flat offsets into every NDIM-th slot, then expand each
      // offset to a full ND index in place.
      using OutputIterator = StrideIterator<OUT_T, NDIM>;
      OutputIterator output_iter(output_ptr, elem_cnt);
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OutputIterator>(
          stream->As<ep::CudaStream>()->cuda_stream(), elem_cnt, temp_storage, workspace,
          input_ptr, output_iter, output_size_ptr)));
      OUT_T dims[NDIM] = {0};
      std::transform(input_shape.ptr(), input_shape.ptr() + input_shape.NumAxes(), dims,
                     [](int64_t dim) { return static_cast<OUT_T>(dim); });
      NdIndexOffsetHelper<OUT_T, NDIM> index_converter(dims);
      CudaOffsetToNdIndexInplace<OUT_T, NDIM>
          <<<GetNumBlocks(elem_cnt), kBlockSize, 0,
             stream->As<ep::CudaStream>()->cuda_stream()>>>(index_converter, output_size_ptr,
                                                            output_ptr);
    }
  }

  // Size of the CUB scratch buffer needed by ArgWhere for `elem_cnt` items,
  // obtained via the CUB dry-run protocol (null temp storage pointer).
  static size_t GetWorkspaceBytesSize(ep::Stream* stream, int64_t elem_cnt) {
    hipStream_t cuda_stream = stream ? stream->As<ep::CudaStream>()->cuda_stream() : 0;
    size_t workspace = 0;
    if (NDIM == 1) {
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OUT_T*>(cuda_stream, elem_cnt, nullptr, workspace,
                                                     nullptr, nullptr, nullptr)));
    } else {
      using OutputIterator = StrideIterator<OUT_T, NDIM>;
      OutputIterator output_iter(nullptr, elem_cnt);
      OF_CUDA_CHECK((SelectTrue<IN_T, OUT_T, OutputIterator>(
          cuda_stream, elem_cnt, nullptr, workspace, nullptr, output_iter, nullptr)));
    }
    return workspace;
  }
};
// Instantiate ArgWhereKernelUtil for kCUDA over the supported type/NDIM combos.
INSTANTIATE_ARG_WHERE_KERNEL_UTIL_FOR_DEVICE(DeviceType::kCUDA)
}
// namespace oneflow
\ No newline at end of file
oneflow/user/kernels/argmax_kernel.hip.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include <hipcub/hipcub.hpp>
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
{
// Carves the argmax op's raw tmp_buffer into two consecutive regions:
//   [per-instance KeyValuePair results][CUB temp storage]
// The pair region is padded to the CUDA alignment; the remainder becomes the
// reduction's scratch space (CHECKed non-negative).
template<typename T>
class TmpBufferManager final {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager);
  // `capacity` is the byte size of `ptr`; one KeyValuePair per instance (row).
  TmpBufferManager(int32_t capacity, void* ptr, int32_t instance_num)
      : capacity_{capacity}, key_value_out_elem_cnt_{instance_num} {
    const int32_t key_value_out_aligned_bytes =
        GetCudaAlignedSize(key_value_out_elem_cnt_ * sizeof(hipcub::KeyValuePair<int32_t, T>));
    key_value_out_ptr_ = reinterpret_cast<hipcub::KeyValuePair<int32_t, T>*>(ptr);
    temp_storage_ptr_ = reinterpret_cast<void*>(reinterpret_cast<char*>(key_value_out_ptr_)
                                                + key_value_out_aligned_bytes);
    temp_storage_bytes_ = capacity_ - key_value_out_aligned_bytes;
    CHECK_GE(temp_storage_bytes_, 0);
  }
  ~TmpBufferManager() = default;

  hipcub::KeyValuePair<int32_t, T>* KeyValueOutPtr() const { return key_value_out_ptr_; }
  void* TempStoragePtr() const { return temp_storage_ptr_; }
  int32_t TempStorageBytes() const { return temp_storage_bytes_; }

 private:
  int32_t capacity_;  // total bytes available in the raw buffer
  hipcub::KeyValuePair<int32_t, T>* key_value_out_ptr_;  // region 1: (index, value) per row
  void* temp_storage_ptr_;                               // region 2: CUB scratch
  int32_t key_value_out_elem_cnt_;
  int32_t temp_storage_bytes_;
};
// Maps a segment index to its begin offset (idx * num_col); used with
// hipcub::CountingInputIterator to form the segment-offset iterator for
// DeviceSegmentedReduce. NOTE(review): single-arg ctor is intentionally
// implicit-convertible today; callers construct it explicitly.
class MultiplyFunctor final {
 public:
  MultiplyFunctor(int32_t num_col) : num_col_(num_col) {}
  __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const {
    return idx * num_col_;
  }

 private:
  int32_t num_col_;
};
// Dry-runs hipcub::DeviceSegmentedReduce::ArgMax (null temp storage) to learn
// the scratch-byte requirement for a (num_row x num_col) segmented argmax.
template<typename T>
size_t InferTempStorageForArgMax(int32_t num_row, int32_t num_col) {
  using SegmentOffsetIter = hipcub::TransformInputIterator<int32_t, MultiplyFunctor,
                                                           hipcub::CountingInputIterator<int32_t>>;
  // Segment i spans [i * num_col, (i + 1) * num_col).
  hipcub::CountingInputIterator<int32_t> counting_iter(0);
  MultiplyFunctor multiply_functor(num_col);
  SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor);
  size_t temp_storage_bytes = 0;
  auto err =
      hipcub::DeviceSegmentedReduce::ArgMax<T*, hipcub::KeyValuePair<int32_t, T>*,
                                            SegmentOffsetIter>(
          /* d_temp_storage */ nullptr,
          /* temp_storage_bytes */ temp_storage_bytes,
          /* d_in */ nullptr,
          /* d_out */ nullptr,
          /* num_segments */ num_row,
          /* d_begin_offsets */ segment_offset_iter,
          /* d_end_offsets */ segment_offset_iter + 1,
          /* stream */ 0);
  // An earlier non-segmented DeviceReduce::ArgMax variant was removed here;
  // see VCS history if the single-segment path needs reviving.
  OF_CUDA_CHECK(err);
  return temp_storage_bytes;
}
// Runs the segmented argmax over num_row rows of num_col elements, writing one
// (index, value) KeyValuePair per row to out_ptr. `temp_storage_ptr` must hold
// at least the size reported by InferTempStorageForArgMax (CHECKed).
template<typename T>
void ArgMax(const T* in_ptr, int32_t num_row, int32_t num_col, void* temp_storage_ptr,
            int32_t temp_storage_bytes, hipcub::KeyValuePair<int32_t, T>* out_ptr,
            hipStream_t stream) {
  size_t rt_inferred_temp_storage_bytes = InferTempStorageForArgMax<T>(num_row, num_col);
  CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes);

  using SegmentOffsetIter = hipcub::TransformInputIterator<int32_t, MultiplyFunctor,
                                                           hipcub::CountingInputIterator<int32_t>>;
  // Segment i spans [i * num_col, (i + 1) * num_col).
  hipcub::CountingInputIterator<int32_t> counting_iter(0);
  MultiplyFunctor multiply_functor(num_col);
  SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor);

  auto err = hipcub::DeviceSegmentedReduce::ArgMax(
      /* d_temp_storage */ temp_storage_ptr,
      /* temp_storage_bytes */ rt_inferred_temp_storage_bytes,
      /* d_in */ in_ptr,
      /* d_out */ out_ptr,
      /* num_segments */ num_row,
      /* d_begin_offsets */ segment_offset_iter,
      /* d_end_offsets */ segment_offset_iter + 1,
      /* stream */ stream);
  // An earlier non-segmented DeviceReduce::ArgMax variant was removed here;
  // see VCS history if the single-segment path needs reviving.
  OF_CUDA_CHECK(err);
}
// Copies each per-row argmax index from the KeyValuePair scratch buffer into
// the int64 output tensor. Per CUB/hipCUB segmented-reduce semantics the key is
// presumably already relative to the segment start, so `% instance_size` is a
// no-op in that case but keeps the value in range either way — TODO confirm.
template<typename T>
__global__ void WriteKeysToOutput(const int32_t instance_num, const int32_t instance_size,
                                  const hipcub::KeyValuePair<int32_t, T>* key_value_out_ptr,
                                  int64_t* out_ptr) {
  CUDA_1D_KERNEL_LOOP(i, instance_num) {
    out_ptr[i] = key_value_out_ptr[i].key % instance_size;
  }
}
}
// namespace
// argmax over the last axis on GPU: flattens the input to
// (instance_num, instance_size), runs segmented ArgMax into the tmp buffer,
// then copies the winning indices into the int64 "out" tensor.
template<typename T>
class GpuArgMaxKernel final : public user_op::OpKernel {
 public:
  GpuArgMaxKernel() = default;
  ~GpuArgMaxKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int32_t elem_cnt = in->shape_view().elem_cnt();
    // Reduction runs along the innermost axis.
    const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1);
    const int32_t instance_num = elem_cnt / instance_size;
    // tmp_buffer layout: [KeyValuePair array | CUB temp storage] (see TmpBufferManager).
    TmpBufferManager<T> buffer_manager(tmp_buffer->shape_view().elem_cnt(),
                                       tmp_buffer->mut_dptr<void>(), instance_num);
    ArgMax(in->dptr<T>(), instance_num, instance_size, buffer_manager.TempStoragePtr(),
           buffer_manager.TempStorageBytes(), buffer_manager.KeyValueOutPtr(),
           ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    // Second pass extracts just the index from each {index, value} pair.
    WriteKeysToOutput<T><<<BlocksNum4ThreadsNum(instance_num), kCudaThreadsNumPerBlock, 0,
                           ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        instance_num, instance_size, buffer_manager.KeyValueOutPtr(), out->mut_dptr<int64_t>());
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers the "argmax" kernel for `dtype` and sizes its tmp buffer:
// aligned KeyValuePair scratch for `instance_num` rows plus the hipCUB
// DeviceSegmentedReduce::ArgMax workspace.
// Fix: the comment lines inside the macro body must carry their own trailing
// line-continuations; a bare comment line would otherwise terminate the
// #define early and leave a stray '\' behind.
#define REGISTER_CUDA_ARGMAX_KERNEL(dtype)                                                         \
  REGISTER_USER_KERNEL("argmax")                                                                   \
      .SetCreateFn<GpuArgMaxKernel<dtype>>()                                                       \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                             \
                       && (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))            \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                          \
        const Shape& in_shape = ctx->InputShape("in", 0);                                          \
        const int32_t instance_size = in_shape.dim_vec().back();                                   \
        const int32_t instance_num = in_shape.elem_cnt() / instance_size;                          \
        /* Key-Value Out */                                                                        \
        int32_t key_value_out_bytes =                                                              \
            GetCudaAlignedSize(instance_num * sizeof(hipcub::KeyValuePair<int32_t, dtype>));       \
        /* CUB Temp Storage */                                                                     \
        size_t temp_storage_bytes = InferTempStorageForArgMax<dtype>(instance_num, instance_size); \
        return key_value_out_bytes + temp_storage_bytes;                                           \
      });
// Instantiate the argmax kernel for every supported CUDA dtype.
// NOTE(review): half/float16 is not registered here — confirm that is intentional.
REGISTER_CUDA_ARGMAX_KERNEL(float)
REGISTER_CUDA_ARGMAX_KERNEL(double)
REGISTER_CUDA_ARGMAX_KERNEL(uint8_t)
REGISTER_CUDA_ARGMAX_KERNEL(int8_t)
REGISTER_CUDA_ARGMAX_KERNEL(int32_t)
REGISTER_CUDA_ARGMAX_KERNEL(int64_t)
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include <hipcub/hipcub.hpp>
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
{
// Carves one flat device buffer into two regions used by the argmax kernel:
//   [ KeyValuePair<int32_t,T> array (one entry per row, aligned) | CUB temp storage ]
// `capacity` is the total buffer size in bytes; `instance_num` is the row count.
template<typename T>
class TmpBufferManager final {
 public:
  OF_DISALLOW_COPY_AND_MOVE(TmpBufferManager);
  TmpBufferManager(int32_t capacity, void* ptr, int32_t instance_num)
      : capacity_{capacity}, key_value_out_elem_cnt_{instance_num} {
    const int32_t key_value_out_aligned_bytes =
        GetCudaAlignedSize(key_value_out_elem_cnt_ * sizeof(hipcub::KeyValuePair<int32_t, T>));
    key_value_out_ptr_ = reinterpret_cast<hipcub::KeyValuePair<int32_t, T>*>(ptr);
    // CUB workspace starts right after the aligned key/value region.
    temp_storage_ptr_ = reinterpret_cast<void*>(reinterpret_cast<char*>(key_value_out_ptr_)
                                                + key_value_out_aligned_bytes);
    temp_storage_bytes_ = capacity_ - key_value_out_aligned_bytes;
    CHECK_GE(temp_storage_bytes_, 0);  // buffer must be large enough for both regions
  }
  ~TmpBufferManager() = default;
  hipcub::KeyValuePair<int32_t, T>* KeyValueOutPtr() const { return key_value_out_ptr_; }
  void* TempStoragePtr() const { return temp_storage_ptr_; }
  int32_t TempStorageBytes() const { return temp_storage_bytes_; }

 private:
  int32_t capacity_;                                      // total bytes handed in
  hipcub::KeyValuePair<int32_t, T>* key_value_out_ptr_;   // per-row {index, value} results
  void* temp_storage_ptr_;                                // CUB scratch region
  int32_t key_value_out_elem_cnt_;                        // == instance_num
  int32_t temp_storage_bytes_;                            // bytes left for CUB scratch
};
// Maps a segment index to its element offset (segment i starts at i * num_col);
// used through TransformInputIterator to feed CUB segment offsets.
class MultiplyFunctor final {
 public:
  MultiplyFunctor(int32_t num_col) : num_col_(num_col) {}
  __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const {
    return idx * num_col_;
  }

 private:
  int32_t num_col_;  // row width (elements per segment)
};
// Returns the workspace size (bytes) DeviceSegmentedReduce::ArgMax needs for
// `num_row` segments of `num_col` elements; the nullptr d_temp_storage call
// only writes the size, it performs no reduction.
template<typename T>
size_t InferTempStorageForArgMax(int32_t num_row, int32_t num_col) {
  using SegmentOffsetIter =
      hipcub::TransformInputIterator<int32_t, MultiplyFunctor,
                                     hipcub::CountingInputIterator<int32_t>>;
  hipcub::CountingInputIterator<int32_t> counting_iter(0);
  MultiplyFunctor multiply_functor(num_col);
  SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor);
  size_t temp_storage_bytes = 0;
  auto err =
      hipcub::DeviceSegmentedReduce::ArgMax<T*, hipcub::KeyValuePair<int32_t, T>*,
                                            SegmentOffsetIter>(
          /* d_temp_storage */ nullptr,
          /* temp_storage_bytes */ temp_storage_bytes,
          /* d_in */ nullptr,
          /* d_out */ nullptr,
          /* num_segments */ num_row,
          /* d_begin_offsets */ segment_offset_iter,
          /* d_end_offsets */ segment_offset_iter + 1,
          /* stream */ 0);
  // auto err =
  //     hipcub::DeviceReduce::ArgMax<T*, hipcub::KeyValuePair<int32_t, T>*>(
  //         nullptr, temp_storage_bytes,
  //         nullptr, nullptr, num_row,
  //         0);
  OF_CUDA_CHECK(err);
  return temp_storage_bytes;
}
// Segmented ArgMax over `num_row` rows of `num_col` elements; writes one
// {index, value} pair per row into `out_ptr` on `stream`. The required scratch
// size is re-derived and checked against the caller's budget.
template<typename T>
void ArgMax(const T* in_ptr, int32_t num_row, int32_t num_col, void* temp_storage_ptr,
            int32_t temp_storage_bytes, hipcub::KeyValuePair<int32_t, T>* out_ptr,
            hipStream_t stream) {
  size_t rt_inferred_temp_storage_bytes = InferTempStorageForArgMax<T>(num_row, num_col);
  CHECK_LE(rt_inferred_temp_storage_bytes, temp_storage_bytes);
  using SegmentOffsetIter =
      hipcub::TransformInputIterator<int32_t, MultiplyFunctor,
                                     hipcub::CountingInputIterator<int32_t>>;
  hipcub::CountingInputIterator<int32_t> counting_iter(0);
  MultiplyFunctor multiply_functor(num_col);
  SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor);
  // void * d_temp_storage = nullptr;
  // hipMalloc((void **)&d_temp_storage, rt_inferred_temp_storage_bytes);
  auto err = hipcub::DeviceSegmentedReduce::ArgMax(
      /* d_temp_storage */ temp_storage_ptr,
      /* temp_storage_bytes */ rt_inferred_temp_storage_bytes,
      /* d_in */ in_ptr,
      /* d_out */ out_ptr,
      /* num_segments */ num_row,
      /* d_begin_offsets */ segment_offset_iter,
      /* d_end_offsets */ segment_offset_iter + 1,
      /* stream */ stream);
  // auto err =
  //     hipcub::DeviceReduce::ArgMax(
  //         d_temp_storage, rt_inferred_temp_storage_bytes,
  //         in_ptr, out_ptr, num_row,
  //         stream);
  OF_CUDA_CHECK(err);
}
// Extracts the per-row argmax index from each {index, value} pair into the
// int64 output; `% instance_size` keeps the index segment-relative.
template<typename T>
__global__ void WriteKeysToOutput(const int32_t instance_num, const int32_t instance_size,
                                  const hipcub::KeyValuePair<int32_t, T>* key_value_out_ptr,
                                  int64_t* out_ptr) {
  CUDA_1D_KERNEL_LOOP(i, instance_num) {
    out_ptr[i] = key_value_out_ptr[i].key % instance_size;
  }
}
}
// namespace
// GPU argmax over the last axis: segmented ArgMax into the tmp buffer, then a
// second kernel copies the winning indices into "out" (int64).
template<typename T>
class GpuArgMaxKernel final : public user_op::OpKernel {
 public:
  GpuArgMaxKernel() = default;
  ~GpuArgMaxKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    user_op::Tensor* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int32_t elem_cnt = in->shape_view().elem_cnt();
    const int32_t instance_size = in->shape_view().At(in->shape_view().NumAxes() - 1);
    const int32_t instance_num = elem_cnt / instance_size;
    TmpBufferManager<T> buffer_manager(tmp_buffer->shape_view().elem_cnt(),
                                       tmp_buffer->mut_dptr<void>(), instance_num);
    ArgMax(in->dptr<T>(), instance_num, instance_size, buffer_manager.TempStoragePtr(),
           buffer_manager.TempStorageBytes(), buffer_manager.KeyValueOutPtr(),
           ctx->stream()->As<ep::CudaStream>()->cuda_stream());
    WriteKeysToOutput<T><<<BlocksNum4ThreadsNum(instance_num), kCudaThreadsNumPerBlock, 0,
                           ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        instance_num, instance_size, buffer_manager.KeyValueOutPtr(), out->mut_dptr<int64_t>());
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers the "argmax" kernel for `dtype` and sizes its tmp buffer.
// Fix: the standalone comment lines had no trailing '\' (the continuation sat
// on its own following line), which terminates the #define at the comment and
// leaves stray '\' lines; each interior line now carries its own continuation.
#define REGISTER_CUDA_ARGMAX_KERNEL(dtype)                                                         \
  REGISTER_USER_KERNEL("argmax")                                                                   \
      .SetCreateFn<GpuArgMaxKernel<dtype>>()                                                       \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                             \
                       && (user_op::HobDataType("in", 0) == GetDataType<dtype>::value))            \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                          \
        const Shape& in_shape = ctx->InputShape("in", 0);                                          \
        const int32_t instance_size = in_shape.dim_vec().back();                                   \
        const int32_t instance_num = in_shape.elem_cnt() / instance_size;                          \
        /* Key-Value Out */                                                                        \
        int32_t key_value_out_bytes =                                                              \
            GetCudaAlignedSize(instance_num * sizeof(hipcub::KeyValuePair<int32_t, dtype>));       \
        /* CUB Temp Storage */                                                                     \
        size_t temp_storage_bytes = InferTempStorageForArgMax<dtype>(instance_num, instance_size); \
        return key_value_out_bytes + temp_storage_bytes;                                           \
      });
// Instantiate the argmax kernel for every supported CUDA dtype
// (no half/float16 registration here — presumably intentional; confirm).
REGISTER_CUDA_ARGMAX_KERNEL(float)
REGISTER_CUDA_ARGMAX_KERNEL(double)
REGISTER_CUDA_ARGMAX_KERNEL(uint8_t)
REGISTER_CUDA_ARGMAX_KERNEL(int8_t)
REGISTER_CUDA_ARGMAX_KERNEL(int32_t)
REGISTER_CUDA_ARGMAX_KERNEL(int64_t)
}
// namespace oneflow
\ No newline at end of file
oneflow/user/kernels/as_strided_kernel.hip.cpp
View file @
8f7de847
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <cstdint>
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/common/just.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/framework/consistency_check.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
namespace
oneflow
{
namespace
{
// Maximum tensor rank supported by the as_strided kernels.
constexpr size_t NUM_DIM = 8;

// POD metadata passed by value to the device kernels; only the first
// `dest_num_dims` entries of `dest_dims`/`stride` are meaningful.
template<size_t num_dims, typename IndexType>
struct AsStridedParams {
  NdIndexOffsetHelper<IndexType, num_dims> destIndexOffsetHelper;  // offset <-> nd-index for dest shape
  int64_t dest_dims[num_dims];   // destination (output / dy) dims
  int32_t stride[num_dims];      // element strides into the strided buffer
  int32_t dest_num_dims;         // actual rank (<= num_dims)
  int32_t storage_offset;        // base element offset into the strided buffer
  int32_t input_num;             // element count of input / dx
  int32_t output_num;            // element count of output / dy (loop bound)
};
// Forward as_strided gather: for each output offset i, decode its nd-index in
// the destination shape, fold it through stride/storage_offset, and copy that
// input element. One thread handles one (or more, grid-permitting) outputs.
// Cleanup: dropped the unused `dest_dims` local and the redundant
// reinterpret_cast on `params.stride` (the array decays to const int32_t*).
template<typename T>
__global__ void AsStrided_kernel(const T* input_buf, T* output_buf,
                                 AsStridedParams<NUM_DIM, int64_t> params) {
  const int32_t* stride = params.stride;
  CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) {
    int64_t dst_index[NUM_DIM];
    params.destIndexOffsetHelper.OffsetToNdIndex(i, dst_index, params.dest_num_dims);
    int32_t index_in_input = params.storage_offset;
    FOR_RANGE(int64_t, j, 0, params.dest_num_dims) {
      index_in_input += dst_index[j] * stride[j];
    }
    output_buf[i] = input_buf[index_in_input];
  }
}
// Backward as_strided scatter: route each dy element back to its source slot in
// dx via stride/storage_offset. Overlapping views can map several dy elements
// to the same dx slot, hence the atomic add (dx must be pre-zeroed by caller).
// Cleanup: dropped the unused `dest_dims` local and the redundant
// reinterpret_cast on `params.stride`.
template<typename T>
__global__ void AsStridedGrad_kernel(const T* dy_buf, T* dx_buf,
                                     AsStridedParams<NUM_DIM, int64_t> params) {
  const int32_t* stride = params.stride;
  CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) {
    int64_t dy_index[NUM_DIM];
    params.destIndexOffsetHelper.OffsetToNdIndex(i, dy_index, params.dest_num_dims);
    int32_t index_in_dx = params.storage_offset;
    FOR_RANGE(int64_t, j, 0, params.dest_num_dims) {
      index_in_dx += dy_index[j] * stride[j];
    }
    cuda::atomic::Add(dx_buf + index_in_dx, dy_buf[i]);
  }
}
// Host-side launcher for the forward kernel: packs shape/stride metadata into
// an AsStridedParams (passed by value) and launches AsStrided_kernel.
// Assumes dest_num_dims <= NUM_DIM — TODO confirm callers guarantee this.
template<typename T>
struct AsStridedFunctor final {
  void operator()(ep::Stream* stream, const T* input_buf, T* output_buf, const int64_t* dest_dims,
                  const int32_t* stride, const int32_t dest_num_dims, const int32_t storage_offset,
                  const int32_t input_num, const int32_t output_num) {
    NdIndexOffsetHelper<int64_t, NUM_DIM> destIndexOffsetHelper(dest_dims, dest_num_dims);
    AsStridedParams<NUM_DIM, int64_t> params;
    params.destIndexOffsetHelper = destIndexOffsetHelper;
    FOR_RANGE(size_t, i, 0, dest_num_dims) {
      params.dest_dims[i] = dest_dims[i];
      params.stride[i] = stride[i];
    }
    params.dest_num_dims = dest_num_dims;
    params.storage_offset = storage_offset;
    params.input_num = input_num;
    params.output_num = output_num;
    AsStrided_kernel<T><<<BlocksNum4ThreadsNum(output_num), kCudaThreadsNumPerBlock, 0,
                          stream->As<ep::CudaStream>()->cuda_stream()>>>(input_buf, output_buf,
                                                                         params);
  }
};
// Host-side launcher for the backward kernel: mirrors AsStridedFunctor but
// iterates over dy (output_num == dy_num) and scatters into dx.
// Caller is responsible for zero-filling dx beforehand (atomic adds accumulate).
template<typename T>
struct AsStridedGradFunctor final {
  void operator()(ep::Stream* stream, const T* dy_buf, T* dx_buf, const int64_t* dy_dims,
                  const int32_t* stride, const int32_t dy_num_dims, const int32_t storage_offset,
                  const int32_t dx_num, const int32_t dy_num) {
    NdIndexOffsetHelper<int64_t, NUM_DIM> dyIndexOffsetHelper(dy_dims, dy_num_dims);
    AsStridedParams<NUM_DIM, int64_t> params;
    params.destIndexOffsetHelper = dyIndexOffsetHelper;
    FOR_RANGE(size_t, i, 0, dy_num_dims) {
      params.dest_dims[i] = dy_dims[i];
      params.stride[i] = stride[i];
    }
    params.dest_num_dims = dy_num_dims;
    params.storage_offset = storage_offset;
    params.input_num = dx_num;
    params.output_num = dy_num;
    AsStridedGrad_kernel<T><<<BlocksNum4ThreadsNum(dy_num), kCudaThreadsNumPerBlock, 0,
                              stream->As<ep::CudaStream>()->cuda_stream()>>>(dy_buf, dx_buf,
                                                                             params);
  }
};
}
// namespace
// Forward as_strided op kernel: gathers `input` into `output` according to the
// "stride" and "storage_offset" attributes (torch-style as_strided semantics —
// presumably; confirm against the op definition).
template<typename T>
class GpuAsStridedKernel final : public user_op::OpKernel {
 public:
  GpuAsStridedKernel() = default;
  ~GpuAsStridedKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0);
    user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0);
    const auto size = ctx->Attr<std::vector<int32_t>>("size");  // NOTE(review): fetched but unused;
                                                                // output shape is used instead
    const auto stride = ctx->Attr<std::vector<int32_t>>("stride");
    const int32_t storage_offset = ctx->Attr<int32_t>("storage_offset");
    size_t dest_num_dims = output->shape_view().NumAxes();
    const int64_t* dest_dims = output->shape_view().ptr();
    const size_t input_num = input->shape_view().Count(0);
    const size_t output_num = output->shape_view().Count(0);
    if (input_num == 0) {
      // 0-size tensor
      return;
    }
    AsStridedFunctor<T>()(ctx->stream(), input->dptr<T>(), output->mut_dptr<T>(), dest_dims,
                          stride.data(), dest_num_dims, storage_offset, input_num, output_num);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Backward as_strided op kernel: zero-fills dx, then scatters dy back through
// the same stride/storage_offset mapping (atomic accumulation for overlaps).
template<typename T>
class GpuAsStridedGradKernel final : public user_op::OpKernel {
 public:
  GpuAsStridedGradKernel() = default;
  ~GpuAsStridedGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
    user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
    const auto size = ctx->Attr<std::vector<int32_t>>("size");  // NOTE(review): fetched but unused
    const auto stride = ctx->Attr<std::vector<int32_t>>("stride");
    const int32_t storage_offset = ctx->Attr<int32_t>("storage_offset");
    size_t dy_num_dims = dy->shape_view().NumAxes();
    const int64_t* dy_dims = dy->shape_view().ptr();
    const size_t dx_num = dx->shape_view().Count(0);
    const size_t dy_num = dy->shape_view().Count(0);
    // dx must start at zero: the grad kernel accumulates with atomic adds.
    Memset<DeviceType::kCUDA>(ctx->stream(), dx->mut_dptr(), 0,
                              dx->shape_view().Count(0) * sizeof(T));
    AsStridedGradFunctor<T>()(ctx->stream(), dy->dptr<T>(), dx->mut_dptr<T>(), dy_dims,
                              stride.data(), dy_num_dims, storage_offset, dx_num, dy_num);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers forward and backward as_strided kernels for `in_type`.
// Fix: normalized the macro so every interior line carries its own trailing
// line-continuation (the diff-mangled form had bare '\' lines that would break
// preprocessing).
// NOTE(review): "as_strided_grad" matches on HobDataType("input", 0) while the
// grad kernel only reads "dy"/"dx" — verify the grad op declares an "input" arg.
#define REGISTER_GPUASSTRIDED_KERNEL(in_type)                                                 \
  REGISTER_USER_KERNEL("as_strided")                                                          \
      .SetCreateFn<GpuAsStridedKernel<in_type>>()                                             \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                        \
                       && (user_op::HobDataType("input", 0) == GetDataType<in_type>::value)); \
  REGISTER_USER_KERNEL("as_strided_grad")                                                     \
      .SetCreateFn<GpuAsStridedGradKernel<in_type>>()                                         \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                        \
                       && (user_op::HobDataType("input", 0) == GetDataType<in_type>::value));
REGISTER_GPUASSTRIDED_KERNEL(half);
REGISTER_GPUASSTRIDED_KERNEL(float);
REGISTER_GPUASSTRIDED_KERNEL(double);
REGISTER_GPUASSTRIDED_KERNEL(int64_t);
#undef REGISTER_GPUASSTRIDED_KERNEL
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <cstdint>
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/common/just.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/framework/consistency_check.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
namespace
oneflow
{
namespace
{
// Maximum tensor rank supported by the as_strided kernels.
constexpr size_t NUM_DIM = 8;

// POD metadata passed by value to the device kernels; only the first
// `dest_num_dims` entries of `dest_dims`/`stride` are meaningful.
template<size_t num_dims, typename IndexType>
struct AsStridedParams {
  NdIndexOffsetHelper<IndexType, num_dims> destIndexOffsetHelper;
  int64_t dest_dims[num_dims];   // destination (output / dy) dims
  int32_t stride[num_dims];      // element strides into the strided buffer
  int32_t dest_num_dims;         // actual rank (<= num_dims)
  int32_t storage_offset;        // base element offset
  int32_t input_num;             // element count of input / dx
  int32_t output_num;            // element count of output / dy (loop bound)
};
// Forward as_strided gather (see sibling copy). Cleanup: removed the unused
// `dest_dims` local and the redundant reinterpret_cast on `params.stride`.
template<typename T>
__global__ void AsStrided_kernel(const T* input_buf, T* output_buf,
                                 AsStridedParams<NUM_DIM, int64_t> params) {
  const int32_t* stride = params.stride;
  CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) {
    int64_t dst_index[NUM_DIM];
    params.destIndexOffsetHelper.OffsetToNdIndex(i, dst_index, params.dest_num_dims);
    int32_t index_in_input = params.storage_offset;
    FOR_RANGE(int64_t, j, 0, params.dest_num_dims) {
      index_in_input += dst_index[j] * stride[j];
    }
    output_buf[i] = input_buf[index_in_input];
  }
}
// Backward as_strided scatter with atomic accumulation (dx pre-zeroed by
// caller). Cleanup: removed the unused `dest_dims` local and the redundant
// reinterpret_cast on `params.stride`.
template<typename T>
__global__ void AsStridedGrad_kernel(const T* dy_buf, T* dx_buf,
                                     AsStridedParams<NUM_DIM, int64_t> params) {
  const int32_t* stride = params.stride;
  CUDA_1D_KERNEL_LOOP_T(int64_t, i, params.output_num) {
    int64_t dy_index[NUM_DIM];
    params.destIndexOffsetHelper.OffsetToNdIndex(i, dy_index, params.dest_num_dims);
    int32_t index_in_dx = params.storage_offset;
    FOR_RANGE(int64_t, j, 0, params.dest_num_dims) {
      index_in_dx += dy_index[j] * stride[j];
    }
    cuda::atomic::Add(dx_buf + index_in_dx, dy_buf[i]);
  }
}
// Host-side launcher for the forward kernel: packs metadata into
// AsStridedParams and launches AsStrided_kernel (assumes rank <= NUM_DIM).
template<typename T>
struct AsStridedFunctor final {
  void operator()(ep::Stream* stream, const T* input_buf, T* output_buf, const int64_t* dest_dims,
                  const int32_t* stride, const int32_t dest_num_dims, const int32_t storage_offset,
                  const int32_t input_num, const int32_t output_num) {
    NdIndexOffsetHelper<int64_t, NUM_DIM> destIndexOffsetHelper(dest_dims, dest_num_dims);
    AsStridedParams<NUM_DIM, int64_t> params;
    params.destIndexOffsetHelper = destIndexOffsetHelper;
    FOR_RANGE(size_t, i, 0, dest_num_dims) {
      params.dest_dims[i] = dest_dims[i];
      params.stride[i] = stride[i];
    }
    params.dest_num_dims = dest_num_dims;
    params.storage_offset = storage_offset;
    params.input_num = input_num;
    params.output_num = output_num;
    AsStrided_kernel<T><<<BlocksNum4ThreadsNum(output_num), kCudaThreadsNumPerBlock, 0,
                          stream->As<ep::CudaStream>()->cuda_stream()>>>(input_buf, output_buf,
                                                                         params);
  }
};
// Host-side launcher for the backward kernel; iterates over dy and scatters
// into dx (caller zero-fills dx; kernel accumulates atomically).
template<typename T>
struct AsStridedGradFunctor final {
  void operator()(ep::Stream* stream, const T* dy_buf, T* dx_buf, const int64_t* dy_dims,
                  const int32_t* stride, const int32_t dy_num_dims, const int32_t storage_offset,
                  const int32_t dx_num, const int32_t dy_num) {
    NdIndexOffsetHelper<int64_t, NUM_DIM> dyIndexOffsetHelper(dy_dims, dy_num_dims);
    AsStridedParams<NUM_DIM, int64_t> params;
    params.destIndexOffsetHelper = dyIndexOffsetHelper;
    FOR_RANGE(size_t, i, 0, dy_num_dims) {
      params.dest_dims[i] = dy_dims[i];
      params.stride[i] = stride[i];
    }
    params.dest_num_dims = dy_num_dims;
    params.storage_offset = storage_offset;
    params.input_num = dx_num;
    params.output_num = dy_num;
    AsStridedGrad_kernel<T><<<BlocksNum4ThreadsNum(dy_num), kCudaThreadsNumPerBlock, 0,
                              stream->As<ep::CudaStream>()->cuda_stream()>>>(dy_buf, dx_buf,
                                                                             params);
  }
};
}
// namespace
// Forward as_strided op kernel (see sibling copy for details).
template<typename T>
class GpuAsStridedKernel final : public user_op::OpKernel {
 public:
  GpuAsStridedKernel() = default;
  ~GpuAsStridedKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* input = ctx->Tensor4ArgNameAndIndex("input", 0);
    user_op::Tensor* output = ctx->Tensor4ArgNameAndIndex("output", 0);
    const auto size = ctx->Attr<std::vector<int32_t>>("size");  // NOTE(review): unused here
    const auto stride = ctx->Attr<std::vector<int32_t>>("stride");
    const int32_t storage_offset = ctx->Attr<int32_t>("storage_offset");
    size_t dest_num_dims = output->shape_view().NumAxes();
    const int64_t* dest_dims = output->shape_view().ptr();
    const size_t input_num = input->shape_view().Count(0);
    const size_t output_num = output->shape_view().Count(0);
    if (input_num == 0) {
      // 0-size tensor
      return;
    }
    AsStridedFunctor<T>()(ctx->stream(), input->dptr<T>(), output->mut_dptr<T>(), dest_dims,
                          stride.data(), dest_num_dims, storage_offset, input_num, output_num);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Backward as_strided op kernel: zero-fills dx, then scatters dy back through
// the stride mapping with atomic accumulation.
template<typename T>
class GpuAsStridedGradKernel final : public user_op::OpKernel {
 public:
  GpuAsStridedGradKernel() = default;
  ~GpuAsStridedGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
    user_op::Tensor* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
    const auto size = ctx->Attr<std::vector<int32_t>>("size");  // NOTE(review): unused here
    const auto stride = ctx->Attr<std::vector<int32_t>>("stride");
    const int32_t storage_offset = ctx->Attr<int32_t>("storage_offset");
    size_t dy_num_dims = dy->shape_view().NumAxes();
    const int64_t* dy_dims = dy->shape_view().ptr();
    const size_t dx_num = dx->shape_view().Count(0);
    const size_t dy_num = dy->shape_view().Count(0);
    // dx must start at zero: the grad kernel accumulates with atomic adds.
    Memset<DeviceType::kCUDA>(ctx->stream(), dx->mut_dptr(), 0,
                              dx->shape_view().Count(0) * sizeof(T));
    AsStridedGradFunctor<T>()(ctx->stream(), dy->dptr<T>(), dx->mut_dptr<T>(), dy_dims,
                              stride.data(), dy_num_dims, storage_offset, dx_num, dy_num);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Registers forward and backward as_strided kernels for `in_type`.
// NOTE(review): "as_strided_grad" matches on HobDataType("input", 0) while the
// grad kernel only reads "dy"/"dx" — verify the grad op declares an "input" arg.
#define REGISTER_GPUASSTRIDED_KERNEL(in_type)                                                 \
  REGISTER_USER_KERNEL("as_strided")                                                          \
      .SetCreateFn<GpuAsStridedKernel<in_type>>()                                             \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                        \
                       && (user_op::HobDataType("input", 0) == GetDataType<in_type>::value)); \
  REGISTER_USER_KERNEL("as_strided_grad")                                                     \
      .SetCreateFn<GpuAsStridedGradKernel<in_type>>()                                         \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                        \
                       && (user_op::HobDataType("input", 0) == GetDataType<in_type>::value));
REGISTER_GPUASSTRIDED_KERNEL(half);
REGISTER_GPUASSTRIDED_KERNEL(float);
REGISTER_GPUASSTRIDED_KERNEL(double);
REGISTER_GPUASSTRIDED_KERNEL(int64_t);
#undef REGISTER_GPUASSTRIDED_KERNEL
}
// namespace oneflow
\ No newline at end of file
oneflow/user/kernels/assign_if_kernel.hip.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
{
// Conditionally copies `value` into `ref` element-wise.
// `condition` points at a single scalar read by every thread; when
// assign_if == (*condition == 0) the whole grid returns without writing,
// so the copy happens iff (*condition != 0) equals `assign_if`.
template<bool assign_if, typename C, typename T>
__global__ void AssignGpu(int64_t elem_cnt, const C* condition, const T* value, T* ref) {
  if (assign_if == (*condition == 0)) { return; }
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) { ref[i] = value[i]; }
}
// Kernel for assign_if / assign_if_not: copies "value" into "ref" when the
// scalar "condition" tensor matches the `assign_if` template flag. The branch
// on the condition happens on-device, so no host synchronization is needed.
template<bool assign_if, typename C, typename T>
class AssignIfGPUKernel final : public user_op::OpKernel {
 public:
  AssignIfGPUKernel() = default;
  ~AssignIfGPUKernel() override = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* condition = ctx->Tensor4ArgNameAndIndex("condition", 0);
    // Condition must be a 1-D tensor holding exactly one scalar.
    CHECK_EQ(condition->shape_view().NumAxes(), 1);
    CHECK_EQ(condition->shape_view().At(0), 1);
    const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0);
    user_op::Tensor* ref = ctx->Tensor4ArgNameAndIndex("ref", 0);
    // Self-assignment: value and ref alias the same buffer, nothing to do.
    if (value->dptr() == ref->dptr()) { return; }
    CHECK_EQ(value->shape_view(), ref->shape_view());
    CHECK_EQ(value->data_type(), ref->data_type());
    const size_t elem_cnt = ref->shape_view().elem_cnt();
    AssignGpu<assign_if, C, T>
        <<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
           ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
            elem_cnt, condition->dptr<C>(), value->dptr<T>(), ref->mut_dptr<T>());
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; }
};
}
// namespace
// Registers one AssignIfGPUKernel instantiation under `op_type_name`, matching
// on device, condition dtype, and value dtype.
// Fix: normalized the macro so every interior line carries its own trailing
// line-continuation (the diff-mangled form had bare '\' lines that would break
// preprocessing).
#define REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(op_type_name, assign_if, condition_type, \
                                                         value_type)                              \
  REGISTER_USER_KERNEL(op_type_name)                                                              \
      .SetCreateFn<AssignIfGPUKernel<assign_if, condition_type, value_type>>()                    \
      .SetIsMatchedHob(                                                                           \
          (user_op::HobDeviceType() == DeviceType::kCUDA)                                         \
          && (user_op::HobDataType("condition", 0) == GetDataType<condition_type>::value)         \
          && (user_op::HobDataType("value", 0) == GetDataType<value_type>::value));

// Registers both the assign_if and assign_if_not variants for one
// (condition dtype, value dtype) pair.
#define REGISTER_ASSIGN_IF_CUDA_KERNEL(condition_type, value_type)                                \
  REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(                                               \
      "assign_if", true, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type));         \
  REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(                                               \
      "assign_if_not", false, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type))

// Integer condition dtypes x all POD value dtypes.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_ASSIGN_IF_CUDA_KERNEL, INT_DATA_TYPE_SEQ,
                                 POD_DATA_TYPE_SEQ)
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
{
// Conditionally copies `value` into `ref`; the copy happens iff
// (*condition != 0) equals the `assign_if` template flag (see sibling copy).
template<bool assign_if, typename C, typename T>
__global__ void AssignGpu(int64_t elem_cnt, const C* condition, const T* value, T* ref) {
  if (assign_if == (*condition == 0)) { return; }
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) { ref[i] = value[i]; }
}
// User-op kernel that conditionally assigns the "value" tensor to the "ref"
// tensor depending on a scalar "condition" tensor (see AssignGpu above).
template<bool assign_if, typename C, typename T>
class AssignIfGPUKernel final : public user_op::OpKernel {
 public:
  AssignIfGPUKernel() = default;
  ~AssignIfGPUKernel() override = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* condition = ctx->Tensor4ArgNameAndIndex("condition", 0);
    // The condition must be a 1-D tensor with exactly one element.
    CHECK_EQ(condition->shape_view().NumAxes(), 1);
    CHECK_EQ(condition->shape_view().At(0), 1);
    const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0);
    user_op::Tensor* ref = ctx->Tensor4ArgNameAndIndex("ref", 0);
    // Source and destination alias the same buffer: nothing to copy.
    if (value->dptr() == ref->dptr()) { return; }
    CHECK_EQ(value->shape_view(), ref->shape_view());
    CHECK_EQ(value->data_type(), ref->data_type());
    const size_t elem_cnt = ref->shape_view().elem_cnt();
    AssignGpu<assign_if, C, T>
        <<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
           ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
            elem_cnt, condition->dptr<C>(), value->dptr<T>(), ref->mut_dptr<T>());
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; }
};
}
// namespace
// Registers AssignIfGPUKernel for one (op name, condition dtype, value dtype)
// combination, matched only on CUDA devices with the given tensor dtypes.
// NOTE: no comments inside the macros below — a // comment would swallow the
// line-continuation backslashes after splicing.
#define REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(op_type_name, assign_if, condition_type, \
                                                         value_type)                              \
  REGISTER_USER_KERNEL(op_type_name)                                                              \
      .SetCreateFn<AssignIfGPUKernel<assign_if, condition_type, value_type>>()                    \
      .SetIsMatchedHob(                                                                           \
          (user_op::HobDeviceType() == DeviceType::kCUDA)                                         \
          && (user_op::HobDataType("condition", 0) == GetDataType<condition_type>::value)         \
          && (user_op::HobDataType("value", 0) == GetDataType<value_type>::value));
// Registers both the "assign_if" and "assign_if_not" op variants for one
// (condition dtype, value dtype) pair.
#define REGISTER_ASSIGN_IF_CUDA_KERNEL(condition_type, value_type)                              \
  REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(                                             \
      "assign_if", true, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type));       \
  REGISTER_ASSIGN_WITH_CONDITION_VALUE_CUDA_KERNEL(                                             \
      "assign_if_not", false, OF_PP_PAIR_FIRST(condition_type), OF_PP_PAIR_FIRST(value_type))
// Instantiates the registrations for every (integer condition, POD value) pair.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE
(
REGISTER_ASSIGN_IF_CUDA_KERNEL
,
INT_DATA_TYPE_SEQ
,
POD_DATA_TYPE_SEQ
)
}
// namespace oneflow
\ No newline at end of file
oneflow/user/kernels/avg_pool_kernel.hip.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include <cstdint>
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/user/kernels/avg_pool_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace {

// Maximum threads per block used by the element-wise launch helpers.
constexpr int kBlockSize = cuda::elementwise::kBlockSize;

// Thread count for a launch: a full block, or fewer when the problem is
// smaller than one block.
int GetMinThreadNum(const int64_t elem_num) { return std::min<int64_t>(elem_num, kBlockSize); }

// Grid size for `elem_cnt` elements.
// Widened from int32_t to int64_t: the callers pass IDX-typed element counts,
// and a 32-bit parameter would silently truncate counts >= 2^31.
int GetNumBlocks(int64_t elem_cnt) {
  int num_blocks = 0;
  OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks));
  return num_blocks;
}

}  // namespace
// Device entry point for 1-D average-pool forward; forwards all arguments to
// Avgpool1dForwardCompute<T>.
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool1dForward(const NdIndexOffsetHelper<IDX, 2> index_helper, IDX elem_num,
                                const T* src, T* dest, int32_t padding_l, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_length,
                                const int32_t kernel_size_l, const int32_t stride_l,
                                const bool count_include_pad, const int32_t divisor_override) {
  Avgpool1dForwardCompute<T>(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel,
                             x_length, kernel_size_l, stride_l, count_include_pad,
                             divisor_override);
}
// Device entry point for 2-D average-pool forward; forwards all arguments to
// Avgpool2dForwardCompute<T>.
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool2dForward(const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num,
                                const T* src, T* dest, const int32_t padding_h,
                                const int32_t padding_w, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_height,
                                const int32_t x_width, const int32_t kernel_size_h,
                                const int32_t kernel_size_w, const int32_t stride_h,
                                const int32_t stride_w, const bool count_include_pad,
                                const int32_t divisor_override) {
  Avgpool2dForwardCompute<T>(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch,
                             n_channel, x_height, x_width, kernel_size_h, kernel_size_w, stride_h,
                             stride_w, count_include_pad, divisor_override);
}
// Device entry point for 3-D average-pool forward; forwards all arguments to
// Avgpool3dForwardCompute<T>.
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool3dForward(const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num,
                                const T* src, T* dest, int32_t padding_t, const int32_t padding_h,
                                const int32_t padding_w, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_time,
                                const int32_t x_height, const int32_t x_width,
                                const int32_t kernel_size_t, int32_t kernel_size_h,
                                const int32_t kernel_size_w, const int32_t stride_t,
                                const int32_t stride_h, const int32_t stride_w,
                                const bool count_include_pad, const int32_t divisor_override) {
  Avgpool3dForwardCompute<T>(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w,
                             n_batch, n_channel, x_time, x_height, x_width, kernel_size_t,
                             kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w,
                             count_include_pad, divisor_override);
}
// Device entry point for 1-D average-pool backward; forwards all arguments to
// Avgpool1dBackwardCompute<T>.
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool1dBackward(const NdIndexOffsetHelper<IDX, 2> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_l,
                                 const int32_t n_batch, const int32_t n_channel,
                                 const int32_t input_length, const int32_t kernel_size_l,
                                 const int32_t stride_l, const bool count_include_pad,
                                 const int32_t divisor_override) {
  Avgpool1dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel,
                              input_length, kernel_size_l, stride_l, count_include_pad,
                              divisor_override);
}
// Device entry point for 2-D average-pool backward; forwards all arguments to
// Avgpool2dBackwardCompute<T>.
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool2dBackward(const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_h,
                                 const int32_t padding_w, const int32_t n_batch,
                                 const int32_t n_channel, const int32_t input_height,
                                 const int32_t input_width, const int32_t kernel_size_h,
                                 const int32_t kernel_size_w, const int32_t stride_h,
                                 const int32_t stride_w, const bool count_include_pad,
                                 int32_t divisor_override) {
  Avgpool2dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch,
                              n_channel, input_height, input_width, kernel_size_h, kernel_size_w,
                              stride_h, stride_w, count_include_pad, divisor_override);
}
// Device entry point for 3-D average-pool backward; forwards all arguments to
// Avgpool3dBackwardCompute<T>.
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool3dBackward(const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_t,
                                 const int32_t padding_h, const int32_t padding_w,
                                 const int32_t n_batch, const int32_t n_channel,
                                 const int32_t x_time, const int32_t x_height,
                                 const int32_t x_width, const int32_t kernel_size_t,
                                 const int32_t kernel_size_h, const int32_t kernel_size_w,
                                 const int32_t stride_t, const int32_t stride_h,
                                 const int32_t stride_w, const bool count_include_pad,
                                 const int32_t divisor_override) {
  Avgpool3dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w,
                              n_batch, n_channel, x_time, x_height, x_width, kernel_size_t,
                              kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w,
                              count_include_pad, divisor_override);
}
// CUDA/HIP backend of AvgPoolKernelUtil. Each wrapper derives the launch
// configuration from the element count and dispatches the matching __global__
// kernel on the ep::Stream's underlying device stream. Pooling parameters are
// unpacked from AvgPoolParams3D: index [2] is the innermost (W/L) dimension,
// [1] is H, [0] is T; GetXShape5D() axes 2/3/4 are T/H/W of the input.
template<typename T, typename IDX>
struct AvgPoolKernelUtil<DeviceType::kCUDA, T, IDX> {
  static void Avgpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool1dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(),
        params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }

  static void Avgpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool1dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(),
        params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }

  static void Avgpool2dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 3>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool2dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2],
        params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(),
        params_3d.divisor_override());
  }

  static void Avgpool2dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool2dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2],
        params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(),
        params_3d.divisor_override());
  }

  static void Avgpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool3dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1],
        params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1],
        params_3d.pool_size_3d()[2], params_3d.stride_3d()[0], params_3d.stride_3d()[1],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }

  static void Avgpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool3dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1],
        params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1],
        params_3d.pool_size_3d()[2], params_3d.stride_3d()[0], params_3d.stride_3d()[1],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }
};
// Instantiates AvgPoolKernelUtil for every supported (dtype, index type) pair.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_AVG_POOL_KERNEL_UTIL, (DeviceType::kCUDA),
                                 AVG_POOL_DATA_TYPE_CUDA_SEQ, AVG_POOL_IDX_DATA_TYPE_SEQ);
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include <cstdint>
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/user/kernels/avg_pool_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace {

// Maximum threads per block used by the element-wise launch helpers.
constexpr int kBlockSize = cuda::elementwise::kBlockSize;

// Thread count for a launch: a full block, or fewer for small problems.
int GetMinThreadNum(const int64_t elem_num) { return std::min<int64_t>(elem_num, kBlockSize); }

// Grid size for `elem_cnt` elements.
int GetNumBlocks(int32_t elem_cnt) {
  int num_blocks = 0;
  OF_CUDA_CHECK(cuda::elementwise::GetNumBlocks(elem_cnt, &num_blocks));
  return num_blocks;
}

}  // namespace

// Device entry points: each simply forwards its arguments to the shared
// Avgpool{1,2,3}d{Forward,Backward}Compute helper.
template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool1dForward(const NdIndexOffsetHelper<IDX, 2> index_helper, IDX elem_num,
                                const T* src, T* dest, int32_t padding_l, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_length,
                                const int32_t kernel_size_l, const int32_t stride_l,
                                const bool count_include_pad, const int32_t divisor_override) {
  Avgpool1dForwardCompute<T>(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel,
                             x_length, kernel_size_l, stride_l, count_include_pad,
                             divisor_override);
}

template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool2dForward(const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num,
                                const T* src, T* dest, const int32_t padding_h,
                                const int32_t padding_w, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_height,
                                const int32_t x_width, const int32_t kernel_size_h,
                                const int32_t kernel_size_w, const int32_t stride_h,
                                const int32_t stride_w, const bool count_include_pad,
                                const int32_t divisor_override) {
  Avgpool2dForwardCompute<T>(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch,
                             n_channel, x_height, x_width, kernel_size_h, kernel_size_w, stride_h,
                             stride_w, count_include_pad, divisor_override);
}

template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool3dForward(const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num,
                                const T* src, T* dest, int32_t padding_t, const int32_t padding_h,
                                const int32_t padding_w, const int32_t n_batch,
                                const int32_t n_channel, const int32_t x_time,
                                const int32_t x_height, const int32_t x_width,
                                const int32_t kernel_size_t, int32_t kernel_size_h,
                                const int32_t kernel_size_w, const int32_t stride_t,
                                const int32_t stride_h, const int32_t stride_w,
                                const bool count_include_pad, const int32_t divisor_override) {
  Avgpool3dForwardCompute<T>(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w,
                             n_batch, n_channel, x_time, x_height, x_width, kernel_size_t,
                             kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w,
                             count_include_pad, divisor_override);
}

template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool1dBackward(const NdIndexOffsetHelper<IDX, 2> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_l,
                                 const int32_t n_batch, const int32_t n_channel,
                                 const int32_t input_length, const int32_t kernel_size_l,
                                 const int32_t stride_l, const bool count_include_pad,
                                 const int32_t divisor_override) {
  Avgpool1dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_l, n_batch, n_channel,
                              input_length, kernel_size_l, stride_l, count_include_pad,
                              divisor_override);
}

template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool2dBackward(const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_h,
                                 const int32_t padding_w, const int32_t n_batch,
                                 const int32_t n_channel, const int32_t input_height,
                                 const int32_t input_width, const int32_t kernel_size_h,
                                 const int32_t kernel_size_w, const int32_t stride_h,
                                 const int32_t stride_w, const bool count_include_pad,
                                 int32_t divisor_override) {
  Avgpool2dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_h, padding_w, n_batch,
                              n_channel, input_height, input_width, kernel_size_h, kernel_size_w,
                              stride_h, stride_w, count_include_pad, divisor_override);
}

template<typename T, typename IDX>
__launch_bounds__(kBlockSize) __global__
    void DoCUDAAvgPool3dBackward(const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num,
                                 const T* src, T* dest, const int32_t padding_t,
                                 const int32_t padding_h, const int32_t padding_w,
                                 const int32_t n_batch, const int32_t n_channel,
                                 const int32_t x_time, const int32_t x_height,
                                 const int32_t x_width, const int32_t kernel_size_t,
                                 const int32_t kernel_size_h, const int32_t kernel_size_w,
                                 const int32_t stride_t, const int32_t stride_h,
                                 const int32_t stride_w, const bool count_include_pad,
                                 const int32_t divisor_override) {
  Avgpool3dBackwardCompute<T>(index_helper, elem_num, src, dest, padding_t, padding_h, padding_w,
                              n_batch, n_channel, x_time, x_height, x_width, kernel_size_t,
                              kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w,
                              count_include_pad, divisor_override);
}

// CUDA/HIP backend of AvgPoolKernelUtil: each wrapper computes the launch
// configuration and dispatches the matching device kernel. padding/pool/stride
// index [2] is the innermost (W/L) dimension, [1] is H, [0] is T; GetXShape5D()
// axes 2/3/4 are T/H/W of the input.
template<typename T, typename IDX>
struct AvgPoolKernelUtil<DeviceType::kCUDA, T, IDX> {
  static void Avgpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool1dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(),
        params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }

  static void Avgpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool1dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[2], params_3d.num_batch(),
        params_3d.num_channel(), params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }

  static void Avgpool2dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 3>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool2dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2],
        params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(),
        params_3d.divisor_override());
  }

  static void Avgpool2dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool2dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[1], params_3d.padding()[2],
        params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[1], params_3d.pool_size_3d()[2],
        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.count_include_pad(),
        params_3d.divisor_override());
  }

  static void Avgpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                               const IDX elem_num, const T* src, T* dest,
                               const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool3dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1],
        params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1],
        params_3d.pool_size_3d()[2], params_3d.stride_3d()[0], params_3d.stride_3d()[1],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }

  static void Avgpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                const IDX elem_num, const T* src, T* dest,
                                const AvgPoolParams3D& params_3d) {
    DoCUDAAvgPool3dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
        index_helper, elem_num, src, dest, params_3d.padding()[0], params_3d.padding()[1],
        params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3),
        params_3d.GetXShape5D().At(4), params_3d.pool_size_3d()[0], params_3d.pool_size_3d()[1],
        params_3d.pool_size_3d()[2], params_3d.stride_3d()[0], params_3d.stride_3d()[1],
        params_3d.stride_3d()[2], params_3d.count_include_pad(), params_3d.divisor_override());
  }
};

// Instantiates AvgPoolKernelUtil for every supported (dtype, index type) pair.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_AVG_POOL_KERNEL_UTIL, (DeviceType::kCUDA),
                                 AVG_POOL_DATA_TYPE_CUDA_SEQ, AVG_POOL_IDX_DATA_TYPE_SEQ);
}
// namespace oneflow
\ No newline at end of file
oneflow/user/kernels/batch_gather_kernel_util.hip.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/user/kernels/batch_gather_kernel_util.h"
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <assert.h>
namespace
oneflow
{
namespace
{
// Maps a flat output offset to the flat input offset selected by the per-batch
// gather indices. Output layout is (batch, indices_num, instance_size); input
// layout is (batch, gather_dim_size, instance_size).
template<typename K>
__device__ int64_t GetInOffset(const int64_t out_offset, const K* indices,
                               const int64_t indices_num, const int64_t instance_size,
                               const int64_t gather_dim_size) {
  const int64_t batch_idx = out_offset / (indices_num * instance_size);
  const int64_t indices_idx = out_offset % (indices_num * instance_size) / instance_size;
  const int64_t inner_idx = out_offset % instance_size;
  const int64_t idx = indices[batch_idx * indices_num + indices_idx];
  // Out-of-range indices are a caller bug; trap in debug device builds.
  assert(idx >= 0 && idx < gather_dim_size);
  return batch_idx * gather_dim_size * instance_size + idx * instance_size + inner_idx;
}
// Forward gather: out[i] = in[GetInOffset(i, ...)] for every output element.
template<typename T, typename K>
__global__ void BatchGatherForwardGpu(const int64_t elem_cnt, const T* in, const K* indices,
                                      const int64_t indices_num, const int64_t instance_size,
                                      const int64_t gather_dim_size, T* out) {
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) {
    out[i] = in[GetInOffset<K>(i, indices, indices_num, instance_size, gather_dim_size)];
  }
}
// Backward gather (scatter-add): accumulates each output gradient into the
// input slot it was gathered from. Uses an atomic add because multiple output
// elements may map to the same input offset when indices repeat.
template<typename T, typename K>
__global__ void BatchGatherBackwardGpu(const int64_t elem_cnt, const T* out_diff, const K* indices,
                                       const int64_t indices_num, const int64_t instance_size,
                                       const int64_t gather_dim_size, T* in_diff) {
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) {
    cuda::atomic::Add(
        in_diff + GetInOffset<K>(i, indices, indices_num, instance_size, gather_dim_size),
        out_diff[i]);
  }
}
}
// namespace
// CUDA/HIP specialization of BatchGatherKernelUtilImpl: launches the gather
// kernels above. Shapes are flattened to (batch, indices, instance) 3-D form.
template<typename T, typename K>
struct BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K> final {
  static void Forward(ep::Stream* stream, const T* in, const K* indices,
                      const Shape& flat_out_shape, const int64_t gather_dim_size, T* out);
  static void Backward(ep::Stream* stream, const T* out_diff, const K* indices,
                       const Shape& flat_out_diff_shape, const int64_t gather_dim_size,
                       T* in_diff);
};
// Launches the forward gather kernel over every element of the flattened
// (batch, indices, instance) output shape.
template<typename T, typename K>
void BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K>::Forward(ep::Stream* stream, const T* in,
                                                                 const K* indices,
                                                                 const Shape& flat_out_shape,
                                                                 const int64_t gather_dim_size,
                                                                 T* out) {
  const int64_t batch_num = flat_out_shape.At(0);
  const int64_t indices_num = flat_out_shape.At(1);
  const int64_t instance_size = flat_out_shape.At(2);
  const int64_t elem_cnt = batch_num * indices_num * instance_size;
  BatchGatherForwardGpu<T, K><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                stream->As<ep::CudaStream>()->cuda_stream()>>>(
      elem_cnt, in, indices, indices_num, instance_size, gather_dim_size, out);
}
// Launches the backward (scatter-add) kernel over every element of the
// flattened (batch, indices, instance) output-gradient shape.
template<typename T, typename K>
void BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K>::Backward(
    ep::Stream* stream, const T* out_diff, const K* indices, const Shape& flat_out_diff_shape,
    const int64_t gather_dim_size, T* in_diff) {
  const int64_t batch_num = flat_out_diff_shape.At(0);
  const int64_t indices_num = flat_out_diff_shape.At(1);
  const int64_t instance_size = flat_out_diff_shape.At(2);
  const int64_t elem_cnt = batch_num * indices_num * instance_size;
  BatchGatherBackwardGpu<T, K><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                                 stream->As<ep::CudaStream>()->cuda_stream()>>>(
      elem_cnt, out_diff, indices, indices_num, instance_size, gather_dim_size, in_diff);
}
// Explicitly instantiates the CUDA impl for every (floating dtype, integer
// index dtype) pair; the helper macro is local, so undefine it afterwards.
#define INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA(in_type_pair, index_type_pair)          \
  template struct BatchGatherKernelUtilImpl<DeviceType::kCUDA, OF_PP_PAIR_FIRST(in_type_pair), \
                                            OF_PP_PAIR_FIRST(index_type_pair)>;
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA,
                                 FLOATING_DATA_TYPE_SEQ, INT_DATA_TYPE_SEQ);
#undef INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/user/kernels/batch_gather_kernel_util.h"
#include "oneflow/core/hip/atomic.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include <assert.h>
namespace
oneflow
{
namespace
{
template
<
typename
K
>
__device__
int64_t
GetInOffset
(
const
int64_t
out_offset
,
const
K
*
indices
,
const
int64_t
indices_num
,
const
int64_t
instance_size
,
const
int64_t
gather_dim_size
)
{
const
int64_t
batch_idx
=
out_offset
/
(
indices_num
*
instance_size
);
const
int64_t
indices_idx
=
out_offset
%
(
indices_num
*
instance_size
)
/
instance_size
;
const
int64_t
inner_idx
=
out_offset
%
instance_size
;
const
int64_t
idx
=
indices
[
batch_idx
*
indices_num
+
indices_idx
];
assert
(
idx
>=
0
&&
idx
<
gather_dim_size
);
return
batch_idx
*
gather_dim_size
*
instance_size
+
idx
*
instance_size
+
inner_idx
;
}
template
<
typename
T
,
typename
K
>
__global__
void
BatchGatherForwardGpu
(
const
int64_t
elem_cnt
,
const
T
*
in
,
const
K
*
indices
,
const
int64_t
indices_num
,
const
int64_t
instance_size
,
const
int64_t
gather_dim_size
,
T
*
out
)
{
CUDA_1D_KERNEL_LOOP
(
i
,
elem_cnt
)
{
out
[
i
]
=
in
[
GetInOffset
<
K
>
(
i
,
indices
,
indices_num
,
instance_size
,
gather_dim_size
)];
}
}
template
<
typename
T
,
typename
K
>
__global__
void
BatchGatherBackwardGpu
(
const
int64_t
elem_cnt
,
const
T
*
out_diff
,
const
K
*
indices
,
const
int64_t
indices_num
,
const
int64_t
instance_size
,
const
int64_t
gather_dim_size
,
T
*
in_diff
)
{
CUDA_1D_KERNEL_LOOP
(
i
,
elem_cnt
)
{
cuda
::
atomic
::
Add
(
in_diff
+
GetInOffset
<
K
>
(
i
,
indices
,
indices_num
,
instance_size
,
gather_dim_size
),
out_diff
[
i
]);
}
}
}
// namespace
template
<
typename
T
,
typename
K
>
struct
BatchGatherKernelUtilImpl
<
DeviceType
::
kCUDA
,
T
,
K
>
final
{
static
void
Forward
(
ep
::
Stream
*
stream
,
const
T
*
in
,
const
K
*
indices
,
const
Shape
&
flat_out_shape
,
const
int64_t
gather_dim_size
,
T
*
out
);
static
void
Backward
(
ep
::
Stream
*
stream
,
const
T
*
out_diff
,
const
K
*
indices
,
const
Shape
&
flat_out_diff_shape
,
const
int64_t
gather_dim_size
,
T
*
in_diff
);
};
template
<
typename
T
,
typename
K
>
void
BatchGatherKernelUtilImpl
<
DeviceType
::
kCUDA
,
T
,
K
>::
Forward
(
ep
::
Stream
*
stream
,
const
T
*
in
,
const
K
*
indices
,
const
Shape
&
flat_out_shape
,
const
int64_t
gather_dim_size
,
T
*
out
)
{
const
int64_t
batch_num
=
flat_out_shape
.
At
(
0
);
const
int64_t
indices_num
=
flat_out_shape
.
At
(
1
);
const
int64_t
instance_size
=
flat_out_shape
.
At
(
2
);
const
int64_t
elem_cnt
=
batch_num
*
indices_num
*
instance_size
;
BatchGatherForwardGpu
<
T
,
K
><<<
BlocksNum4ThreadsNum
(
elem_cnt
),
kCudaThreadsNumPerBlock
,
0
,
stream
->
As
<
ep
::
CudaStream
>
()
->
cuda_stream
()
>>>
(
elem_cnt
,
in
,
indices
,
indices_num
,
instance_size
,
gather_dim_size
,
out
);
}
// Launches BatchGatherBackwardGpu to scatter-add out_diff into in_diff.
// Assumes flat_out_diff_shape has exactly 3 axes; in_diff must be zero-initialized
// by the caller since the kernel only accumulates.
template<typename T, typename K>
void BatchGatherKernelUtilImpl<DeviceType::kCUDA, T, K>::Backward(
    ep::Stream* stream, const T* out_diff, const K* indices, const Shape& flat_out_diff_shape,
    const int64_t gather_dim_size, T* in_diff) {
  const int64_t batch_num = flat_out_diff_shape.At(0);
  const int64_t indices_num = flat_out_diff_shape.At(1);
  const int64_t instance_size = flat_out_diff_shape.At(2);
  const int64_t elem_cnt = batch_num * indices_num * instance_size;
  // Guard against empty tensors: launching a kernel with a zero-sized grid is an
  // error in CUDA/HIP (grid dimensions must be >= 1), and there is no work to do.
  if (elem_cnt == 0) { return; }
  BatchGatherBackwardGpu<T, K>
      <<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
         stream->As<ep::CudaStream>()->cuda_stream()>>>(elem_cnt, out_diff, indices, indices_num,
                                                        instance_size, gather_dim_size, in_diff);
}
// Explicitly instantiate the CUDA impl for every (floating element type,
// integer index type) pair so the definitions above are emitted in this TU.
#define INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA(in_type_pair, index_type_pair) \
template struct BatchGatherKernelUtilImpl<DeviceType::kCUDA, OF_PP_PAIR_FIRST(in_type_pair), \
OF_PP_PAIR_FIRST(index_type_pair)>;
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE
(
INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA
,
FLOATING_DATA_TYPE_SEQ
,
INT_DATA_TYPE_SEQ
);
#undef INSTANTIATE_BATCH_GATHER_KERNEL_UTIL_IMPL_CUDA
}
// namespace oneflow
\ No newline at end of file
oneflow/user/kernels/binary_cross_entropy_kernel.hip.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/user/kernels/loss_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
user_op
{
namespace
{
using
namespace
loss
;
// Elementwise binary cross entropy:
//   -(t * log(x) + (1 - t) * log(1 - x))
// with each log term clamped below at -100 to keep the result finite at x == 0
// or x == 1. Asserts input is a probability in [0, 1].
template<typename T>
struct BinaryCrossEntropyFunctor {
  T zero_;
  T one_;
  T negative_hundred_;
  BinaryCrossEntropyFunctor()
      : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()), negative_hundred_(static_cast<T>(-100)) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val) const {
    assert(input_val >= zero_);
    assert(input_val <= one_);
    return (target_val - one_) * max(static_cast<T>(log(one_ - input_val)), negative_hundred_)
           - target_val * max(static_cast<T>(log(input_val)), negative_hundred_);
  }
  // Weighted variant: scale the unweighted loss by weight_val.
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
// float specialization: uses the single-precision device intrinsics (logf)
// instead of promoting through double.
template<>
struct BinaryCrossEntropyFunctor<float> {
  float zero_;
  float one_;
  float negative_hundred_;
  BinaryCrossEntropyFunctor() : zero_(0.f), one_(1.f), negative_hundred_(-100.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val) const {
    assert(input_val >= zero_);
    assert(input_val <= one_);
    return (target_val - one_) * max(logf(one_ - input_val), negative_hundred_)
           - target_val * max(logf(input_val), negative_hundred_);
  }
  // Weighted variant: scale the unweighted loss by weight_val.
  __device__ __forceinline__ float operator()(float input_val, float target_val,
                                              float weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
// half specialization: compute in float for accuracy, convert at the edges.
template<>
struct BinaryCrossEntropyFunctor<half> {
  BinaryCrossEntropyFunctor<float> float_functor;
  __device__ __forceinline__ half operator()(half input_val, half target_val) const {
    return __float2half(float_functor(__half2float(input_val), __half2float(target_val)));
  }
  // Weighted variant: scale the unweighted loss by weight_val (half multiply).
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};
// Gradient of BCE w.r.t. the input:
//   dy * (x - t) / max((1 - x) * x, eps)
// eps avoids division by zero at x == 0 or x == 1.
template<typename T>
struct BinaryCrossEntropyGradFunctor {
  T eps_;
  T one_;
  BinaryCrossEntropyGradFunctor() : eps_(static_cast<T>(1e-12)), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const {
    return dy_val * (input_val - target_val) / max((one_ - input_val) * input_val, eps_);
  }
  // Weighted variant: scale the unweighted gradient by weight_val.
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val,
                                          T weight_val) const {
    return (*this)(input_val, target_val, dy_val) * weight_val;
  }
};
// half specialization of the BCE gradient: compute in float, convert at the edges.
template<>
struct BinaryCrossEntropyGradFunctor<half> {
  BinaryCrossEntropyGradFunctor<float> float_functor;
  BinaryCrossEntropyGradFunctor() {}
  __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const {
    return __float2half(
        float_functor(__half2float(input_val), __half2float(target_val), __half2float(dy_val)));
  }
  // Weighted variant: the weight is also applied in float precision.
  __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val,
                                             half weight_val) const {
    return __float2half(float_functor(__half2float(input_val), __half2float(target_val),
                                      __half2float(dy_val), __half2float(weight_val)));
  }
};
// CUDA/ROCm user-op kernel computing elementwise BCE loss.
// Inputs: "input" (probabilities), "target", optional "weight"; output: "out".
template<typename T>
class BinaryCrossEntropyKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyKernel() = default;
  ~BinaryCrossEntropyKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    if (ctx->has_input("weight", 0)) {
      // Weighted loss: dispatch the 3-ary functor overload.
      const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
      OF_CUDA_CHECK((cuda::elementwise::Ternary(
          BinaryCrossEntropyFunctor<T>(), elem_cnt, out, input, target, weight,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    } else {
      OF_CUDA_CHECK(
          (cuda::elementwise::Binary(BinaryCrossEntropyFunctor<T>(), elem_cnt, out, input, target,
                                     ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// CUDA/ROCm user-op kernel computing the BCE input gradient.
// Inputs: "input", "target", "dy", optional "weight"; output: "dx".
template<typename T>
class BinaryCrossEntropyGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyGradKernel() = default;
  ~BinaryCrossEntropyGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    if (ctx->has_input("weight", 0)) {
      // Weighted gradient needs four inputs, so use the generic launcher.
      const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
      using FunctorT = BinaryCrossEntropyGradFunctor<T>;
      using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
      OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
          FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    } else {
      OF_CUDA_CHECK((cuda::elementwise::Ternary(
          BinaryCrossEntropyGradFunctor<T>(), elem_cnt, dx, input, target, dy,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
}
// namespace
#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype) \
REGISTER_USER_KERNEL
(
"binary_cross_entropy"
)
\
.
SetCreateFn
<
BinaryCrossEntropyKernel
<
dtype
>>
()
\
.
SetIsMatchedHob
((
user_op
::
HobDeviceType
()
==
DeviceType
::
kCUDA
)
\
&&
(
user_op
::
HobDataType
(
"input"
,
0
)
==
GetDataType
<
dtype
>::
value
)
\
&&
(
user_op
::
HobDataType
(
"target"
,
0
)
==
GetDataType
<
dtype
>::
value
)
\
&&
(
user_op
::
HobDataType
(
"out"
,
0
)
==
GetDataType
<
dtype
>::
value
));
#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype) \
REGISTER_USER_KERNEL
(
"binary_cross_entropy_grad"
)
\
.
SetCreateFn
<
BinaryCrossEntropyGradKernel
<
dtype
>>
()
\
.
SetIsMatchedHob
((
user_op
::
HobDeviceType
()
==
DeviceType
::
kCUDA
)
\
&&
(
user_op
::
HobDataType
(
"input"
,
0
)
==
GetDataType
<
dtype
>::
value
)
\
&&
(
user_op
::
HobDataType
(
"target"
,
0
)
==
GetDataType
<
dtype
>::
value
)
\
&&
(
user_op
::
HobDataType
(
"dy"
,
0
)
==
GetDataType
<
dtype
>::
value
)
\
&&
(
user_op
::
HobDataType
(
"dx"
,
0
)
==
GetDataType
<
dtype
>::
value
));
REGISTER_BINARY_CROSS_ENTROPY_KERNEL
(
half
)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL
(
float
)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL
(
double
)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL
(
half
)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL
(
float
)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL
(
double
)
}
// namespace user_op
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/user/kernels/loss_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
user_op
{
namespace
{
using
namespace
loss
;
// Elementwise binary cross entropy:
//   -(t * log(x) + (1 - t) * log(1 - x))
// with each log term clamped below at -100 to keep the result finite at x == 0
// or x == 1. Asserts input is a probability in [0, 1].
template<typename T>
struct BinaryCrossEntropyFunctor {
  T zero_;
  T one_;
  T negative_hundred_;
  BinaryCrossEntropyFunctor()
      : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()), negative_hundred_(static_cast<T>(-100)) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val) const {
    assert(input_val >= zero_);
    assert(input_val <= one_);
    return (target_val - one_) * max(static_cast<T>(log(one_ - input_val)), negative_hundred_)
           - target_val * max(static_cast<T>(log(input_val)), negative_hundred_);
  }
  // Weighted variant: scale the unweighted loss by weight_val.
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};

// float specialization: uses the single-precision device intrinsics (logf).
template<>
struct BinaryCrossEntropyFunctor<float> {
  float zero_;
  float one_;
  float negative_hundred_;
  BinaryCrossEntropyFunctor() : zero_(0.f), one_(1.f), negative_hundred_(-100.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val) const {
    assert(input_val >= zero_);
    assert(input_val <= one_);
    return (target_val - one_) * max(logf(one_ - input_val), negative_hundred_)
           - target_val * max(logf(input_val), negative_hundred_);
  }
  __device__ __forceinline__ float operator()(float input_val, float target_val,
                                              float weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};

// half specialization: compute in float for accuracy, convert at the edges.
template<>
struct BinaryCrossEntropyFunctor<half> {
  BinaryCrossEntropyFunctor<float> float_functor;
  __device__ __forceinline__ half operator()(half input_val, half target_val) const {
    return __float2half(float_functor(__half2float(input_val), __half2float(target_val)));
  }
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return (*this)(input_val, target_val) * weight_val;
  }
};

// Gradient of BCE w.r.t. the input:
//   dy * (x - t) / max((1 - x) * x, eps)
// eps avoids division by zero at x == 0 or x == 1.
template<typename T>
struct BinaryCrossEntropyGradFunctor {
  T eps_;
  T one_;
  BinaryCrossEntropyGradFunctor() : eps_(static_cast<T>(1e-12)), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const {
    return dy_val * (input_val - target_val) / max((one_ - input_val) * input_val, eps_);
  }
  // Weighted variant: scale the unweighted gradient by weight_val.
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val,
                                          T weight_val) const {
    return (*this)(input_val, target_val, dy_val) * weight_val;
  }
};

// half specialization of the BCE gradient: compute in float, convert at the edges.
template<>
struct BinaryCrossEntropyGradFunctor<half> {
  BinaryCrossEntropyGradFunctor<float> float_functor;
  BinaryCrossEntropyGradFunctor() {}
  __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val) const {
    return __float2half(
        float_functor(__half2float(input_val), __half2float(target_val), __half2float(dy_val)));
  }
  __device__ __forceinline__ half operator()(half input_val, half target_val, half dy_val,
                                             half weight_val) const {
    return __float2half(float_functor(__half2float(input_val), __half2float(target_val),
                                      __half2float(dy_val), __half2float(weight_val)));
  }
};
// CUDA/ROCm user-op kernel computing elementwise BCE loss.
// Inputs: "input" (probabilities), "target", optional "weight"; output: "out".
template<typename T>
class BinaryCrossEntropyKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyKernel() = default;
  ~BinaryCrossEntropyKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    if (ctx->has_input("weight", 0)) {
      // Weighted loss: dispatch the 3-ary functor overload.
      const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
      OF_CUDA_CHECK((cuda::elementwise::Ternary(
          BinaryCrossEntropyFunctor<T>(), elem_cnt, out, input, target, weight,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    } else {
      OF_CUDA_CHECK(
          (cuda::elementwise::Binary(BinaryCrossEntropyFunctor<T>(), elem_cnt, out, input, target,
                                     ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

// CUDA/ROCm user-op kernel computing the BCE input gradient.
// Inputs: "input", "target", "dy", optional "weight"; output: "dx".
template<typename T>
class BinaryCrossEntropyGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyGradKernel() = default;
  ~BinaryCrossEntropyGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    if (ctx->has_input("weight", 0)) {
      // Weighted gradient needs four inputs, so use the generic launcher.
      const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
      using FunctorT = BinaryCrossEntropyGradFunctor<T>;
      using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
      OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
          FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    } else {
      OF_CUDA_CHECK((cuda::elementwise::Ternary(
          BinaryCrossEntropyGradFunctor<T>(), elem_cnt, dx, input, target, dy,
          ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
}
// namespace
// Register forward/backward BCE kernels for half, float and double on CUDA.
#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype)                                        \
  REGISTER_USER_KERNEL("binary_cross_entropy")                                             \
      .SetCreateFn<BinaryCrossEntropyKernel<dtype>>()                                      \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));

#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype)                                   \
  REGISTER_USER_KERNEL("binary_cross_entropy_grad")                                        \
      .SetCreateFn<BinaryCrossEntropyGradKernel<dtype>>()                                  \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)     \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));

REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double)
}
// namespace user_op
}
// namespace oneflow
\ No newline at end of file
oneflow/user/kernels/binary_cross_entropy_with_logits_kernel.hip.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ndarray/ndarray_util.h"
#include "oneflow/core/ndarray/xpu_var_ndarray.h"
#include "oneflow/user/kernels/loss_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
user_op
{
namespace
{
using
namespace
loss
;
// Which optional weight tensors participate in the with-logits loss:
// none, elementwise "weight", broadcast "pos_weight", or both.
enum class WeightType {
  kNone,
  kWeight,
  kPosWeight,
  kBoth,
};

// Primary template; only the specializations below are defined.
template<typename T, WeightType WEIGHT_TYPE>
struct BinaryCrossEntropyWithLogitsFunctor;
// Numerically stable BCE-with-logits, no weights:
//   (1 - t) * x + max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
// The max-shift keeps the exponentials from overflowing for large |x|.
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone> {
  T zero_;
  T one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val) const {
    const T max_val = -input_val < zero_ ? zero_ : -input_val;
    return (one_ - target_val) * input_val + max_val
           + (log(exp(-max_val) + exp(-input_val - max_val)));
  }
};

// Variant with a positive-class weight: the log term is scaled by
// (pos_weight - t + 1), i.e. pos_weight where t == 1 and 1 where t == 0.
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight> {
  T zero_;
  T one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    const T max_val = -input_val < zero_ ? zero_ : -input_val;
    const T pos_weight_processed_val = weight_val - target_val + one_;
    return (one_ - target_val) * input_val
           + (pos_weight_processed_val * (log(exp(-max_val) + exp(-input_val - max_val))
                                          + max_val));
  }
};
// float specializations: identical math using single-precision intrinsics
// (logf/expf) to avoid promotion to double on the device.
template<>
struct BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kNone> {
  float zero_;
  float one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val) const {
    const float max_val = -input_val < zero_ ? zero_ : -input_val;
    return (one_ - target_val) * input_val + max_val
           + (logf(expf(-max_val) + expf(-input_val - max_val)));
  }
};

template<>
struct BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kPosWeight> {
  float zero_;
  float one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val,
                                              float weight_val) const {
    const float max_val = -input_val < zero_ ? zero_ : -input_val;
    const float pos_weight_processed_val = weight_val - target_val + one_;
    return (one_ - target_val) * input_val
           + (pos_weight_processed_val * (logf(expf(-max_val) + expf(-input_val - max_val))
                                          + max_val));
  }
};
// Elementwise-weight variant: the unweighted loss scaled by weight_val.
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    return f(input_val, target_val) * weight_val;
  }
};

// Both weights: pos_weight applied inside the stable formula, elementwise
// weight applied as an outer scale.
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val,
                                          T pos_weight_val) const {
    return f(input_val, target_val, pos_weight_val) * weight_val;
  }
};
// half specializations: delegate to the float functors, converting operands on
// the way in and the result on the way out.
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kNone> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kNone> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val) const {
    return __float2half(f(__half2float(input_val), __half2float(target_val)));
  }
};

template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kPosWeight> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kPosWeight> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return __float2half(
        f(__half2float(input_val), __half2float(target_val), __half2float(weight_val)));
  }
};

template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kWeight> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    return __float2half(
        f(__half2float(input_val), __half2float(target_val), __half2float(weight_val)));
  }
};

template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kBoth> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val, half weight_val,
                                             half pos_weight_val) const {
    return __float2half(f(__half2float(input_val), __half2float(target_val),
                          __half2float(weight_val), __half2float(pos_weight_val)));
  }
};
// Sigmoid via tanh identity: sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5.
template<typename T>
__device__ __forceinline__ T CalSigmoid(const T x) {
  const T half_of_one = static_cast<T>(0.5);
  return half_of_one * tanh(half_of_one * x) + half_of_one;
}

// float specialization uses the single-precision tanhf intrinsic.
template<>
__device__ __forceinline__ float CalSigmoid(const float x) {
  const float half_of_one = static_cast<float>(0.5);
  return half_of_one * tanhf(half_of_one * x) + half_of_one;
}

// half specialization computes in float and converts the result back.
template<>
__device__ __forceinline__ half CalSigmoid(const half x) {
  return __float2half(CalSigmoid(__half2float(x)));
}
// Primary template; only the specializations below are defined.
template<typename T, WeightType WEIGHT_TYPE>
struct BinaryCrossEntropyWithLogitsGradFunctor;

// Unweighted gradient: (sigmoid(x) - t) * dy.
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone> {
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const {
    return (CalSigmoid(input_val) - target_val) * dy_val;
  }
};

// pos_weight gradient: dy * ((w + 1 - t) * sigmoid(x) - w),
// where w is the (already broadcast) positive-class weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight> {
  T one_;
  BinaryCrossEntropyWithLogitsGradFunctor() : one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val,
                                          T weight_val) const {
    return dy_val * ((weight_val + one_ - target_val) * CalSigmoid(input_val) - weight_val);
  }
};

// Elementwise-weight gradient: the unweighted gradient scaled by weight_val.
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val,
                                          T weight_val) const {
    return f(input_val, target_val, dy_val) * weight_val;
  }
};

// Both weights: pos_weight handled by the kPosWeight functor, then scaled by
// the elementwise weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val,
                                          T pos_weight_val) const {
    return f(input_val, target_val, dy_val, pos_weight_val) * weight_val;
  }
};
// CUDA/ROCm user-op kernel for BCE-with-logits.
// Inputs: "input" (logits), "target", optional "weight", optional "pos_weight"
// (enabled by attr "has_pos_weight"); output: "out". "tmp_buffer" holds the
// pos_weight broadcast-multiplied with target along the last axis.
template<typename T>
class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsKernel() = default;
  ~BinaryCrossEntropyWithLogitsKernel() override = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    if (ctx->Attr<bool>("has_pos_weight")) {
      // Precompute pos_weight * target into the tmp buffer: pos_weight is a
      // 1-D tensor broadcast over the last axis of target.
      T* pos_weight_processed = tmp_buffer_blob->mut_dptr<T>();
      const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr<T>();
      Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes());
      pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1,
                           ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt());
      NdarrayUtil<DeviceType::kCUDA, T>::BroadcastMul(
          ctx->stream(), XpuVarNdarray<T>(target_blob->shape_view(), pos_weight_processed),
          XpuVarNdarray<const T>(pos_weight_shape, pos_weight),
          XpuVarNdarray<const T>(target_blob->shape_view(), target));
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kBoth>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, out, input, target, weight, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight>(), elem_cnt, out, input,
            target, pos_weight_processed, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    } else {
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kWeight>(), elem_cnt, out, input,
            target, weight, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Binary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone>(), elem_cnt, out, input,
            target, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// CUDA/ROCm user-op kernel for the BCE-with-logits input gradient.
// Inputs: "input", "target", "dy", optional "weight"/"pos_weight"; output: "dx".
// Mirrors the forward kernel's weight dispatch, including the pos_weight
// broadcast into "tmp_buffer".
template<typename T>
class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsGradKernel() = default;
  ~BinaryCrossEntropyWithLogitsGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    if (ctx->Attr<bool>("has_pos_weight")) {
      // Precompute pos_weight * target (broadcast over the last axis).
      T* pos_weight_processed = tmp_buffer_blob->mut_dptr<T>();
      const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr<T>();
      Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes());
      pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1,
                           ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt());
      NdarrayUtil<DeviceType::kCUDA, T>::BroadcastMul(
          ctx->stream(), XpuVarNdarray<T>(target_blob->shape_view(), pos_weight_processed),
          XpuVarNdarray<const T>(pos_weight_shape, pos_weight),
          XpuVarNdarray<const T>(target_blob->shape_view(), target));
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kBoth>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    } else {
      if (ctx->has_input("weight", 0)) {
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kWeight>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone>(), elem_cnt, dx, input,
            target, dy, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    }
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template
<
typename
T
>
user_op
::
InferTmpSizeFn
GenFwInferTmpSizeFn
()
{
return
[](
user_op
::
InferContext
*
ctx
)
{
const
int64_t
n
=
ctx
->
InputShape
(
"input"
,
0
).
elem_cnt
();
size_t
tmp_buffer_size
=
0
;
if
(
ctx
->
Attr
<
bool
>
(
"has_pos_weight"
))
{
tmp_buffer_size
+=
GetCudaAlignedSize
(
n
*
sizeof
(
T
));
}
return
tmp_buffer_size
;
};
}
// Builds the backward kernel's temp-buffer size inference function.
// Mirrors GenFwInferTmpSizeFn but sizes from the "target" input shape.
template<typename T>
user_op::InferTmpSizeFn GenBwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) {
    const int64_t element_count = ctx->InputShape("target", 0).elem_cnt();
    size_t buffer_bytes = 0;
    if (ctx->Attr<bool>("has_pos_weight")) {
      buffer_bytes += GetCudaAlignedSize(element_count * sizeof(T));
    }
    return buffer_bytes;
  };
}
}
// namespace
// Registers the forward BCE-with-logits kernel for one dtype on CUDA devices.
// NOTE: the macro body was mangled across lines (continuations separated from
// their content), which is not a valid preprocessor definition; reconstructed
// as a single well-formed macro matching the clean copy of this file.
#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype)                                        \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits")                                 \
      .SetCreateFn<BinaryCrossEntropyWithLogitsKernel<dtype>>()                            \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))   \
      .SetInferTmpSizeFn(GenFwInferTmpSizeFn<dtype>());
// Registers the backward BCE-with-logits kernel for one dtype on CUDA devices.
// NOTE: the macro body was mangled across lines (continuations separated from
// their content); reconstructed as a single well-formed macro matching the
// clean copy of this file.
#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype)                                   \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_grad")                            \
      .SetCreateFn<BinaryCrossEntropyWithLogitsGradKernel<dtype>>()                        \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)     \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value))    \
      .SetInferTmpSizeFn(GenBwInferTmpSizeFn<dtype>());
// Instantiate forward and backward kernels for the supported floating types.
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double)
}
// namespace user_op
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include "oneflow/core/ndarray/ndarray_util.h"
#include "oneflow/core/ndarray/xpu_var_ndarray.h"
#include "oneflow/user/kernels/loss_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
user_op
{
namespace
{
using
namespace
loss
;
// Which optional weighting inputs a functor specialization consumes:
// element weight ("weight"), positive-class weight ("pos_weight"), both, or neither.
enum class WeightType {
  kNone,
  kWeight,
  kPosWeight,
  kBoth,
};
// Primary template; only the WeightType specializations below are defined.
template<typename T, WeightType WEIGHT_TYPE>
struct BinaryCrossEntropyWithLogitsFunctor;
// Unweighted BCE-with-logits, computed in the numerically stable form
//   (1 - t) * x + max(-x, 0) + log(exp(-max) + exp(-x - max)).
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone> {
  T zero_;
  T one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val) const {
    // max(-x, 0) keeps both exp() arguments non-positive.
    const T max_val = -input_val < zero_ ? zero_ : -input_val;
    const T log_term = log(exp(-max_val) + exp(-input_val - max_val));
    return (one_ - target_val) * input_val + max_val + log_term;
  }
};
// BCE-with-logits with a positive-class weight. `weight_val` carries the
// already-broadcast pos_weight * target product (see BroadcastMul in the
// kernel), so the log-term coefficient becomes 1 + (pos_weight - 1) * target.
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight> {
  T zero_;
  T one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(GetZeroVal<T>()), one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    const T max_val = -input_val < zero_ ? zero_ : -input_val;
    // weight_val == pos_weight * target, hence pw*t - t + 1 == 1 + (pw-1)*t.
    const T pos_weight_processed_val = weight_val - target_val + one_;
    const T log_term = log(exp(-max_val) + exp(-input_val - max_val));
    return (one_ - target_val) * input_val + (pos_weight_processed_val * (log_term + max_val));
  }
};
// float specialization of the unweighted functor: uses the single-precision
// logf/expf intrinsics instead of the generic log/exp.
template<>
struct BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kNone> {
  float zero_;
  float one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val) const {
    const float max_val = -input_val < zero_ ? zero_ : -input_val;
    const float log_term = logf(expf(-max_val) + expf(-input_val - max_val));
    return (one_ - target_val) * input_val + max_val + log_term;
  }
};
// float specialization of the pos_weight functor (logf/expf intrinsics).
// `weight_val` is the pre-broadcast pos_weight * target product.
template<>
struct BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kPosWeight> {
  float zero_;
  float one_;
  BinaryCrossEntropyWithLogitsFunctor() : zero_(0.f), one_(1.f) {}
  __device__ __forceinline__ float operator()(float input_val, float target_val,
                                              float weight_val) const {
    const float max_val = -input_val < zero_ ? zero_ : -input_val;
    const float pos_weight_processed_val = weight_val - target_val + one_;
    const float log_term = logf(expf(-max_val) + expf(-input_val - max_val));
    return (one_ - target_val) * input_val + (pos_weight_processed_val * (log_term + max_val));
  }
};
// Element-weighted loss: the unweighted result scaled by a per-element weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val) const {
    const T unweighted = f(input_val, target_val);
    return unweighted * weight_val;
  }
};
// Both weightings: pos_weight-adjusted loss scaled by the per-element weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T weight_val,
                                          T pos_weight_val) const {
    const T pos_weighted = f(input_val, target_val, pos_weight_val);
    return pos_weighted * weight_val;
  }
};
// half specialization: converts to float, delegates, converts back.
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kNone> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kNone> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val) const {
    const float result = f(__half2float(input_val), __half2float(target_val));
    return __float2half(result);
  }
};
// half specialization with pos_weight: compute in float, return half.
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kPosWeight> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kPosWeight> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    const float result =
        f(__half2float(input_val), __half2float(target_val), __half2float(weight_val));
    return __float2half(result);
  }
};
// half specialization with element weight: compute in float, return half.
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kWeight> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val,
                                             half weight_val) const {
    const float result =
        f(__half2float(input_val), __half2float(target_val), __half2float(weight_val));
    return __float2half(result);
  }
};
// half specialization with both weights: compute in float, return half.
template<>
struct BinaryCrossEntropyWithLogitsFunctor<half, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsFunctor<float, WeightType::kBoth> f;
  __device__ __forceinline__ half operator()(half input_val, half target_val, half weight_val,
                                             half pos_weight_val) const {
    const float result = f(__half2float(input_val), __half2float(target_val),
                           __half2float(weight_val), __half2float(pos_weight_val));
    return __float2half(result);
  }
};
// Device sigmoid expressed via tanh: sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5.
template<typename T>
__device__ __forceinline__ T CalSigmoid(const T x) {
  const T half_of_one = static_cast<T>(0.5);
  return half_of_one * tanh(half_of_one * x) + half_of_one;
}
// float specialization: uses the single-precision tanhf intrinsic.
template<>
__device__ __forceinline__ float CalSigmoid(const float x) {
  const float half_of_one = static_cast<float>(0.5);
  return half_of_one * tanhf(half_of_one * x) + half_of_one;
}
// half specialization: computes in float and converts back.
template<>
__device__ __forceinline__ half CalSigmoid(const half x) {
  return __float2half(CalSigmoid(__half2float(x)));
}
// Primary template for the backward functors; specialized per WeightType below.
template<typename T, WeightType WEIGHT_TYPE>
struct BinaryCrossEntropyWithLogitsGradFunctor;
// Unweighted gradient: dL/dx = (sigmoid(x) - t) * dy.
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone> {
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val) const {
    const T residual = CalSigmoid(input_val) - target_val;
    return residual * dy_val;
  }
};
// Gradient with pos_weight. As in the forward pass, `weight_val` is the
// pre-broadcast pos_weight * target product:
//   dL/dx = dy * ((pw*t + 1 - t) * sigmoid(x) - pw*t).
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight> {
  T one_;
  BinaryCrossEntropyWithLogitsGradFunctor() : one_(GetOneVal<T>()) {}
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val,
                                          T weight_val) const {
    const T coeff = weight_val + one_ - target_val;
    return dy_val * (coeff * CalSigmoid(input_val) - weight_val);
  }
};
// Element-weighted gradient: the unweighted gradient scaled by the weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kWeight> {
  BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val,
                                          T weight_val) const {
    const T unweighted = f(input_val, target_val, dy_val);
    return unweighted * weight_val;
  }
};
// Both weightings: pos_weight gradient scaled by the per-element weight.
template<typename T>
struct BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kBoth> {
  BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight> f;
  __device__ __forceinline__ T operator()(T input_val, T target_val, T dy_val, T weight_val,
                                          T pos_weight_val) const {
    const T pos_weighted = f(input_val, target_val, dy_val, pos_weight_val);
    return pos_weighted * weight_val;
  }
};
// Forward kernel for binary_cross_entropy_with_logits. Dispatches to one of
// four element-wise functors depending on which optional inputs are present:
// "weight" (per-element) and/or "pos_weight" (positive-class, broadcast along
// the last axis). When pos_weight is used, tmp_buffer holds the broadcasted
// pos_weight * target product consumed by the kPosWeight/kBoth functors.
template<typename T>
class BinaryCrossEntropyWithLogitsKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsKernel() = default;
  ~BinaryCrossEntropyWithLogitsKernel() override = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    if (ctx->Attr<bool>("has_pos_weight")) {
      // Broadcast pos_weight (a last-axis vector) against target into tmp_buffer.
      T* pos_weight_processed = tmp_buffer_blob->mut_dptr<T>();
      const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr<T>();
      Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes());
      pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1,
                           ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt());
      NdarrayUtil<DeviceType::kCUDA, T>::BroadcastMul(
          ctx->stream(), XpuVarNdarray<T>(target_blob->shape_view(), pos_weight_processed),
          XpuVarNdarray<const T>(pos_weight_shape, pos_weight),
          XpuVarNdarray<const T>(target_blob->shape_view(), target));
      if (ctx->has_input("weight", 0)) {
        // pos_weight + per-element weight: four-input launcher.
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kBoth>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, out, input, target, weight, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        // pos_weight only.
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kPosWeight>(), elem_cnt, out,
            input, target, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    } else {
      if (ctx->has_input("weight", 0)) {
        // per-element weight only.
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kWeight>(), elem_cnt, out, input,
            target, weight, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        // fully unweighted.
        OF_CUDA_CHECK((cuda::elementwise::Binary(
            BinaryCrossEntropyWithLogitsFunctor<T, WeightType::kNone>(), elem_cnt, out, input,
            target, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    }
  }
  // No work to do when all outputs are empty.
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Backward kernel for binary_cross_entropy_with_logits. Mirrors the forward
// kernel's dispatch: chooses the grad functor by which of "weight" /
// "pos_weight" are present, reusing tmp_buffer for the broadcast
// pos_weight * target product.
// FIX: destructor was missing `override` (inconsistent with the forward kernel).
template<typename T>
class BinaryCrossEntropyWithLogitsGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsGradKernel() = default;
  ~BinaryCrossEntropyWithLogitsGradKernel() override = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    auto* tmp_buffer_blob = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    const int64_t elem_cnt = input_blob->shape_view().elem_cnt();
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    if (ctx->Attr<bool>("has_pos_weight")) {
      // Broadcast pos_weight (a last-axis vector) against target into tmp_buffer.
      T* pos_weight_processed = tmp_buffer_blob->mut_dptr<T>();
      const T* pos_weight = ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->dptr<T>();
      Shape pos_weight_shape = Shape::Ones(target_blob->shape_view().NumAxes());
      pos_weight_shape.Set(pos_weight_shape.NumAxes() - 1,
                           ctx->Tensor4ArgNameAndIndex("pos_weight", 0)->shape_view().elem_cnt());
      NdarrayUtil<DeviceType::kCUDA, T>::BroadcastMul(
          ctx->stream(), XpuVarNdarray<T>(target_blob->shape_view(), pos_weight_processed),
          XpuVarNdarray<const T>(pos_weight_shape, pos_weight),
          XpuVarNdarray<const T>(target_blob->shape_view(), target));
      if (ctx->has_input("weight", 0)) {
        // pos_weight + per-element weight: five-input launcher.
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kBoth>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        // pos_weight only.
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kPosWeight>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, pos_weight_processed,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    } else {
      if (ctx->has_input("weight", 0)) {
        // per-element weight only.
        const T* weight = ctx->Tensor4ArgNameAndIndex("weight", 0)->dptr<T>();
        using FunctorT = BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kWeight>;
        using FactoryT = cuda::elementwise::SimpleFactory<FunctorT>;
        OF_CUDA_CHECK((cuda::elementwise::GenericLauncher<FactoryT, T, T, T, T, T>::Launch(
            FactoryT(FunctorT()), elem_cnt, dx, input, target, dy, weight,
            ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      } else {
        // fully unweighted.
        OF_CUDA_CHECK((cuda::elementwise::Ternary(
            BinaryCrossEntropyWithLogitsGradFunctor<T, WeightType::kNone>(), elem_cnt, dx, input,
            target, dy, ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
      }
    }
  }
  // No work to do when all outputs are empty.
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
// Infers the forward kernel's tmp-buffer size: one T per input element, but
// only when "has_pos_weight" is set (buffer stores pos_weight * target).
template<typename T>
user_op::InferTmpSizeFn GenFwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) {
    const int64_t count = ctx->InputShape("input", 0).elem_cnt();
    size_t bytes = 0;
    if (ctx->Attr<bool>("has_pos_weight")) { bytes += GetCudaAlignedSize(count * sizeof(T)); }
    return bytes;
  };
}
// Infers the backward kernel's tmp-buffer size; sized from the "target" shape,
// needed only when "has_pos_weight" is set.
template<typename T>
user_op::InferTmpSizeFn GenBwInferTmpSizeFn() {
  return [](user_op::InferContext* ctx) {
    const int64_t count = ctx->InputShape("target", 0).elem_cnt();
    size_t bytes = 0;
    if (ctx->Attr<bool>("has_pos_weight")) { bytes += GetCudaAlignedSize(count * sizeof(T)); }
    return bytes;
  };
}
}
// namespace
// Registers the forward BCE-with-logits kernel for `dtype` on CUDA, matching
// on the device type and the dtypes of input/target/out.
#define REGISTER_BINARY_CROSS_ENTROPY_KERNEL(dtype) \
REGISTER_USER_KERNEL("binary_cross_entropy_with_logits") \
.SetCreateFn<BinaryCrossEntropyWithLogitsKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
&& (user_op::HobDataType("input", 0) == GetDataType<dtype>::value) \
&& (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
&& (user_op::HobDataType("out", 0) == GetDataType<dtype>::value)) \
.SetInferTmpSizeFn(GenFwInferTmpSizeFn<dtype>());
// Registers the backward BCE-with-logits kernel for `dtype` on CUDA, matching
// on the device type and the dtypes of input/target/dy/dx.
#define REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(dtype) \
REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_grad") \
.SetCreateFn<BinaryCrossEntropyWithLogitsGradKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
&& (user_op::HobDataType("input", 0) == GetDataType<dtype>::value) \
&& (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
&& (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value) \
&& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value)) \
.SetInferTmpSizeFn(GenBwInferTmpSizeFn<dtype>());
// Instantiate forward and backward kernels for the supported floating types.
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_GRAD_KERNEL(double)
}
// namespace user_op
}
// namespace oneflow
\ No newline at end of file
oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel.hip.cpp
View file @
8f7de847
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include <hipcub/hipcub.hpp>
#include "oneflow/core/kernel/cuda_graph_support.h"
namespace
oneflow
{
namespace
user_op
{
namespace
{
// Launch configuration for the fused reduce-mean kernels.
constexpr int32_t kBlockSize = 1024;                       // threads per fused-kernel block
constexpr int32_t kReduceLocalSumBlockSize = 1024;         // threads for the final reduce
constexpr int32_t kSingleBlockProcessNumThreshold = 1024;  // below this, use a single block
// Accumulation type used inside the reduction: identity by default.
template<typename T>
struct DefaultComputeType {
  using type = T;
};
// half inputs accumulate in float to avoid precision loss in the reduction.
template<>
struct DefaultComputeType<half> {
  using type = float;
};
// Computes a grid size for `func`: at least 1 block, at most `max_blocks`,
// and at most `waves` full waves of resident blocks
// (sm_count * occupancy-per-SM * waves).
// Returns the first HIP error encountered, hipSuccess otherwise.
// FIX: the result of hipOccupancyMaxActiveBlocksPerMultiprocessor was ignored,
// so on failure `max_active_blocks` was used uninitialized.
template<class Func>
inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size,
                               int64_t max_blocks, int64_t waves, int* num_blocks) {
  int dev = 0;
  {
    hipError_t err = hipGetDevice(&dev);
    if (err != hipSuccess) { return err; }
  }
  int sm_count = 0;
  {
    hipError_t err =
        hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev);
    if (err != hipSuccess) { return err; }
  }
  int max_active_blocks = 0;
  {
    hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func,
                                                                  block_size, dynamic_smem_size);
    if (err != hipSuccess) { return err; }
  }
  *num_blocks =
      std::max<int>(1, std::min<int64_t>(max_blocks, sm_count * max_active_blocks * waves));
  return hipSuccess;
}
// Fused kernel: computes the numerically stable BCE-with-logits loss per
// element (grid-stride over local_elem_cnt), block-reduces the partial sums,
// and writes one partial mean per block to out[blockIdx.x]. The division by
// reduce_elem_cnt turns the sum into the mean contribution.
template<typename In, typename Out, typename ComputeType>
__global__ void FusedBinaryCrossEntropyWithLogitsReduceMeanKernel(
    const In* input, const In* target, Out* out, const int32_t local_elem_cnt,
    const int32_t reduce_elem_cnt) {
  ComputeType zero = static_cast<ComputeType>(0.0);
  ComputeType one = static_cast<ComputeType>(1.0);
  using BlockReduce = hipcub::BlockReduce<ComputeType, kBlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  ComputeType reduce_sum = 0.0;
  CUDA_1D_KERNEL_LOOP(i, local_elem_cnt) {
    const ComputeType input_val = static_cast<ComputeType>(input[i]);
    const ComputeType target_val = static_cast<ComputeType>(target[i]);
    // max(-x, 0) keeps both exp() arguments non-positive (stable log-sum-exp).
    const ComputeType max_val = -input_val < zero ? zero : -input_val;
    const ComputeType result = (one - target_val) * input_val + max_val
                               + (log(exp(-max_val) + exp(-input_val - max_val)));
    reduce_sum += result;
  }
  const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum);
  // Thread 0 holds the valid block-wide sum after BlockReduce::Sum.
  if (threadIdx.x == 0) { out[blockIdx.x] = static_cast<Out>(block_reduce_sum / reduce_elem_cnt); }
}
// Single-block kernel that sums `elem_cnt` per-block partial results from
// block_local_sum_buf and writes the total to out[0].
template<typename Out, typename ComputeType>
__global__ void ReduceLocalSumKernel(ComputeType* block_local_sum_buf, Out* out,
                                     int64_t elem_cnt) {
  using BlockReduce = hipcub::BlockReduce<ComputeType, kReduceLocalSumBlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  ComputeType reduce_sum = 0.0;
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) { reduce_sum += block_local_sum_buf[i]; }
  const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum);
  // Thread 0 holds the valid block-wide sum after BlockReduce::Sum.
  if (threadIdx.x == 0) { out[0] = static_cast<Out>(block_reduce_sum); }
}
// Device sigmoid via tanh: sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5.
template<typename T>
__device__ __forceinline__ T Sigmoid(const T x) {
  const T half_of_one = static_cast<T>(0.5);
  return half_of_one * tanh(half_of_one * x) + half_of_one;
}
// half specialization: computes in float and converts back.
template<>
__device__ __forceinline__ half Sigmoid(const half x) {
  return __float2half(Sigmoid(__half2float(x)));
}
// Per-element gradient of the reduce-mean BCE-with-logits loss:
//   dx = (sigmoid(x) - t) * dy / N, with 1/N passed in as elem_cnt_reciprocal.
// FIX: members were declared in the opposite order to the ctor init-list
// (dy before elem_cnt_reciprocal), which triggers -Wreorder and misleads the
// reader about initialization order; declaration order now matches.
template<typename T, typename ComputeType>
struct BinaryCrossEntropyWithLogitsReduceMeanGradFunctor {
  OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradFunctor(
      const T elem_cnt_reciprocal, const T dy)
      : elem_cnt_reciprocal(elem_cnt_reciprocal), dy(dy) {}
  __device__ T operator()(const T input_val, const T target_val) const {
    return (Sigmoid(input_val) - target_val) * dy * elem_cnt_reciprocal;
  }
  const T elem_cnt_reciprocal;
  const T dy;
};
// Factory functor: captures the upstream-gradient pointer on the host and
// dereferences it on the device to build the element-wise grad functor
// (lets dy stay on-device without a host sync).
// NOTE(review): 1.0f / elem_cnt computes the reciprocal in float even when
// T is double — precision-limited; kept as-is to preserve behavior.
// FIX: member declaration order now matches the ctor init-list order
// (was reversed, triggering -Wreorder).
template<typename T, typename ComputeType>
struct BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor {
  OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor(
      const int32_t elem_cnt, const T* dy_ptr)
      : elem_cnt_reciprocal(1.0f / elem_cnt), dy_ptr(dy_ptr) {}
  __device__ BinaryCrossEntropyWithLogitsReduceMeanGradFunctor<T, ComputeType> operator()() const {
    return BinaryCrossEntropyWithLogitsReduceMeanGradFunctor<T, ComputeType>(elem_cnt_reciprocal,
                                                                             *dy_ptr);
  }
  const T elem_cnt_reciprocal;
  const T* dy_ptr;
};
// Forward kernel for binary_cross_entropy_with_logits_reduce_mean.
// Small inputs (<= kSingleBlockProcessNumThreshold) are reduced by a single
// fused block writing out[0] directly; larger inputs write per-block partial
// means into tmp_buffer and a second kernel sums them into out[0].
// FIXES:
//  - removed unused locals `input`, `target`, `out` (the launches read the
//    blobs directly);
//  - the final ReduceLocalSumKernel was told to sum `block_num` entries, but
//    only `launch_block` partial sums are ever written (launch_block can be
//    clamped below block_num), so the tail read uninitialized memory; it now
//    sums exactly `launch_block` entries.
// NOTE(review): tmp_buffer capacity is derived via sizeof(T) but the buffer
// stores ComputeType values (float for half) — looks undersized for half;
// confirm against the registration's InferTmpSizeFn.
template<typename T>
class BinaryCrossEntropyWithLogitsMeanKernel final : public user_op::OpKernel,
                                                     public CudaGraphSupport {
 public:
  BinaryCrossEntropyWithLogitsMeanKernel() = default;
  ~BinaryCrossEntropyWithLogitsMeanKernel() override = default;
  // No work to do when all outputs are empty.
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
  std::shared_ptr<user_op::OpKernelCache> InitOpKernelCache(
      user_op::KernelCacheContext* ctx) const override {
    return CreateBCEWithLogitsReduceMeanKernelCache(ctx);
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache* cache) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    int64_t local_elem_cnt = input_blob->shape_view().elem_cnt();
    int64_t reduce_elem_cnt = local_elem_cnt;
    if (cache != nullptr) {
      // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor.
      const auto* bce_cache = dynamic_cast<const BCEWithLogitsReduceMeanKernelCache*>(cache);
      CHECK_NOTNULL(bce_cache);
      reduce_elem_cnt = bce_cache->reduce_elem_cnt();
    }
    using ComputeType = typename DefaultComputeType<T>::type;
    if (local_elem_cnt <= kSingleBlockProcessNumThreshold) {
      // One block handles everything and writes the mean straight to out[0].
      FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, T, ComputeType>
          <<<1, kBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              input_blob->dptr<T>(), target_blob->dptr<T>(), out_blob->mut_dptr<T>(),
              local_elem_cnt, reduce_elem_cnt);
    } else {
      auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
      const int64_t tmp_buffer_elem_cnt = tmp_buffer->shape_view().elem_cnt() / sizeof(T);
      const int64_t block_num = (local_elem_cnt + kBlockSize - 1) / kBlockSize;
      int launch_block = block_num;
      // Cap the grid by occupancy (32 waves) and by the tmp-buffer capacity.
      OF_CUDA_CHECK(GetNumBlocks(
          FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, ComputeType, ComputeType>,
          kBlockSize, 0, block_num, 32, &launch_block));
      launch_block = std::min<int32_t>(tmp_buffer_elem_cnt, launch_block);
      FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, ComputeType, ComputeType>
          <<<launch_block, kBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              input_blob->dptr<T>(), target_blob->dptr<T>(), tmp_buffer->mut_dptr<ComputeType>(),
              local_elem_cnt, reduce_elem_cnt);
      // Exactly launch_block partial sums were written above.
      ReduceLocalSumKernel<T, ComputeType>
          <<<1, kReduceLocalSumBlockSize, 0,
             ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              tmp_buffer->mut_dptr<ComputeType>(), out_blob->mut_dptr<T>(), launch_block);
    }
  }
};
// Backward kernel for binary_cross_entropy_with_logits_reduce_mean:
// dx = (sigmoid(x) - t) * dy / reduce_elem_cnt, element-wise, with dy read on
// the device via the Dyptr factory functor (no host sync).
// FIX: destructor was missing `override` (other kernels in these files mark it).
template<typename T>
class BinaryCrossEntropyWithLogitsReduceMeanGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default;
  ~BinaryCrossEntropyWithLogitsReduceMeanGradKernel() override = default;
  // No work to do when all outputs are empty.
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
  std::shared_ptr<user_op::OpKernelCache> InitOpKernelCache(
      user_op::KernelCacheContext* ctx) const override {
    return CreateBCEWithLogitsReduceMeanKernelCache(ctx);
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache* cache) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    int64_t local_elem_cnt = input_blob->shape_view().elem_cnt();
    int64_t reduce_elem_cnt = local_elem_cnt;
    if (cache != nullptr) {
      // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor.
      const auto* bce_cache = dynamic_cast<const BCEWithLogitsReduceMeanKernelCache*>(cache);
      CHECK_NOTNULL(bce_cache);
      reduce_elem_cnt = bce_cache->reduce_elem_cnt();
    }
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    using ComputeType = typename DefaultComputeType<T>::type;
    OF_CUDA_CHECK((cuda::elementwise::BinaryWithFactory(
        BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor<T, ComputeType>(reduce_elem_cnt,
                                                                               dy),
        local_elem_cnt, dx, input, target,
        ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
  }
};
}
// namespace
// Registers the fused reduce-mean forward kernel for one dtype. The tmp buffer
// is sized for the per-block partial sums, using the same GetNumBlocks
// computation the kernel itself uses at launch time.
// NOTE: the macro body was mangled across lines (continuations separated from
// their content), which is not a valid preprocessor definition; reconstructed
// as a single well-formed macro.
// NOTE(review): the buffer is sized with sizeof(dtype) although the kernel
// stores ComputeType partial sums (float for half) — confirm for half.
#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(dtype)                                  \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean")                           \
      .SetCreateFn<BinaryCrossEntropyWithLogitsMeanKernel<dtype>>()                              \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                           \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)        \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value)       \
                       && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value))         \
      .SetInferTmpSizeFn([](user_op::InferContext* ctx) {                                        \
        const int64_t elem_cnt = ctx->InputShape("input", 0).elem_cnt();                         \
        const int64_t block_num = (elem_cnt + kBlockSize - 1) / kBlockSize;                      \
        int launch_block = block_num;                                                            \
        using ComputeType = typename DefaultComputeType<dtype>::type;                            \
        OF_CUDA_CHECK(GetNumBlocks(                                                              \
            FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<dtype, ComputeType, ComputeType>,  \
            kBlockSize, 0, block_num, 32, &launch_block));                                       \
        const int64_t tmp_buffer_size = GetCudaAlignedSize(launch_block * sizeof(dtype));        \
        return tmp_buffer_size;                                                                  \
      });
// Registers the reduce-mean backward kernel for one dtype (no tmp buffer).
// NOTE: the macro body was mangled across lines (continuations separated from
// their content); reconstructed as a single well-formed macro.
#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(dtype)                       \
  REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean_grad")                \
      .SetCreateFn<BinaryCrossEntropyWithLogitsReduceMeanGradKernel<dtype>>()              \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)                     \
                       && (user_op::HobDataType("input", 0) == GetDataType<dtype>::value)  \
                       && (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
                       && (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value)     \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
// Instantiate reduce-mean forward and backward kernels for the floating types.
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(double)
}
// namespace user_op
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/user/kernels/binary_cross_entropy_with_logits_mean_kernel_util.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
#include "oneflow/core/hip/elementwise.hip.h"
#include <hipcub/hipcub.hpp>
#include "oneflow/core/kernel/cuda_graph_support.h"
namespace
oneflow
{
namespace
user_op
{
namespace
{
// Launch configuration for the fused reduce-mean kernels.
constexpr int32_t kBlockSize = 1024;                       // threads per fused-kernel block
constexpr int32_t kReduceLocalSumBlockSize = 1024;         // threads for the final reduce
constexpr int32_t kSingleBlockProcessNumThreshold = 1024;  // below this, use a single block
// Accumulation type used inside the reduction: identity by default.
template<typename T>
struct DefaultComputeType {
  using type = T;
};
// half inputs accumulate in float to avoid precision loss in the reduction.
template<>
struct DefaultComputeType<half> {
  using type = float;
};
// Computes a grid size for `func`: at least 1 block, at most `max_blocks`,
// and at most `waves` full waves of resident blocks
// (sm_count * occupancy-per-SM * waves).
// Returns the first HIP error encountered, hipSuccess otherwise.
// FIX: the result of hipOccupancyMaxActiveBlocksPerMultiprocessor was ignored,
// so on failure `max_active_blocks` was used uninitialized.
template<class Func>
inline hipError_t GetNumBlocks(Func func, int64_t block_size, size_t dynamic_smem_size,
                               int64_t max_blocks, int64_t waves, int* num_blocks) {
  int dev = 0;
  {
    hipError_t err = hipGetDevice(&dev);
    if (err != hipSuccess) { return err; }
  }
  int sm_count = 0;
  {
    hipError_t err =
        hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, dev);
    if (err != hipSuccess) { return err; }
  }
  int max_active_blocks = 0;
  {
    hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, func,
                                                                  block_size, dynamic_smem_size);
    if (err != hipSuccess) { return err; }
  }
  *num_blocks =
      std::max<int>(1, std::min<int64_t>(max_blocks, sm_count * max_active_blocks * waves));
  return hipSuccess;
}
// Fused, numerically stable binary-cross-entropy-with-logits (forward) plus a
// per-block mean reduction.  Per element:
//   loss = (1 - y) * x + max(-x, 0)
//          + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
// which is the overflow-safe rewrite of (1 - y) * x + log(1 + exp(-x)).
// Each thread accumulates a partial sum over its CUDA_1D_KERNEL_LOOP range,
// the block combines partials with hipcub::BlockReduce, and thread 0 writes
// block_sum / reduce_elem_cnt to out[blockIdx.x].  With a single launch block
// out[0] is the final mean; with several blocks the per-block partial means
// are summed afterwards by ReduceLocalSumKernel.
template<typename In, typename Out, typename ComputeType>
__global__ void FusedBinaryCrossEntropyWithLogitsReduceMeanKernel(
    const In* input, const In* target, Out* out, const int32_t local_elem_cnt,
    const int32_t reduce_elem_cnt) {
  ComputeType zero = static_cast<ComputeType>(0.0);
  ComputeType one = static_cast<ComputeType>(1.0);
  using BlockReduce = hipcub::BlockReduce<ComputeType, kBlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  ComputeType reduce_sum = 0.0;
  CUDA_1D_KERNEL_LOOP(i, local_elem_cnt) {
    const ComputeType input_val = static_cast<ComputeType>(input[i]);
    const ComputeType target_val = static_cast<ComputeType>(target[i]);
    // max(-x, 0): keeps both exp() arguments <= 0 below.
    const ComputeType max_val = -input_val < zero ? zero : -input_val;
    const ComputeType result = (one - target_val) * input_val + max_val
                               + (log(exp(-max_val) + exp(-input_val - max_val)));
    reduce_sum += result;
  }
  const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum);
  if (threadIdx.x == 0) {
    // Divide by the global element count here so multi-block partials can be
    // combined later with a plain sum.
    out[blockIdx.x] = static_cast<Out>(block_reduce_sum / reduce_elem_cnt);
  }
}
// Sums the per-block partial results written by
// FusedBinaryCrossEntropyWithLogitsReduceMeanKernel and stores the total in
// out[0].  Intended to be launched with a single block (the call site uses
// <<<1, kReduceLocalSumBlockSize>>>); every block that did run would
// otherwise race on out[0].
template<typename Out, typename ComputeType>
__global__ void ReduceLocalSumKernel(ComputeType* block_local_sum_buf, Out* out,
                                     int64_t elem_cnt) {
  using BlockReduce = hipcub::BlockReduce<ComputeType, kReduceLocalSumBlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  ComputeType reduce_sum = 0.0;
  CUDA_1D_KERNEL_LOOP(i, elem_cnt) { reduce_sum += block_local_sum_buf[i]; }
  const ComputeType block_reduce_sum = BlockReduce(temp_storage).Sum(reduce_sum);
  if (threadIdx.x == 0) { out[0] = static_cast<Out>(block_reduce_sum); }
}
// Device sigmoid via the identity sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5,
// avoiding a direct exp(-x) evaluation.
template<typename T>
__device__ __forceinline__ T Sigmoid(const T x) {
  const T half_of_one = static_cast<T>(0.5);
  return half_of_one * tanh(half_of_one * x) + half_of_one;
}
// half specialization: computes in float and converts the result back to half.
template<>
__device__ __forceinline__ half Sigmoid(const half x) {
  return __float2half(Sigmoid(__half2float(x)));
}
// Element-wise backward of BCE-with-logits + reduce_mean:
//   dx = (sigmoid(x) - y) * dy * (1 / reduce_elem_cnt)
// `dy` is the scalar upstream gradient; `elem_cnt_reciprocal` is the
// precomputed reciprocal of the mean's divisor.
template<typename T, typename ComputeType>
struct BinaryCrossEntropyWithLogitsReduceMeanGradFunctor {
  // FIX: the initializer list now matches the member declaration order
  // (dy, then elem_cnt_reciprocal).  Members are always initialized in
  // declaration order, so the original swapped list had identical behavior
  // but triggered -Wreorder.
  OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradFunctor(
      const T elem_cnt_reciprocal, const T dy)
      : dy(dy), elem_cnt_reciprocal(elem_cnt_reciprocal) {}
  __device__ T operator()(const T input_val, const T target_val) const {
    return (Sigmoid(input_val) - target_val) * dy * elem_cnt_reciprocal;
  }
  const T dy;
  const T elem_cnt_reciprocal;
};
// Factory functor for cuda::elementwise::BinaryWithFactory: builds the
// element-wise grad functor on the device, dereferencing the device-side
// scalar `dy_ptr` at kernel-launch time (no host synchronization needed).
template<typename T, typename ComputeType>
struct BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor {
  // FIX: initializer list reordered to match member declaration order
  // (dy_ptr, then elem_cnt_reciprocal); behavior is unchanged but the
  // original order triggered -Wreorder.
  OF_DEVICE_FUNC explicit BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor(
      const int32_t elem_cnt, const T* dy_ptr)
      : dy_ptr(dy_ptr), elem_cnt_reciprocal(1.0f / elem_cnt) {}
  __device__ BinaryCrossEntropyWithLogitsReduceMeanGradFunctor<T, ComputeType> operator()() const {
    return BinaryCrossEntropyWithLogitsReduceMeanGradFunctor<T, ComputeType>(elem_cnt_reciprocal,
                                                                             *dy_ptr);
  }
  const T* dy_ptr;
  const T elem_cnt_reciprocal;
};
// Forward kernel for binary_cross_entropy_with_logits_reduce_mean.
// Small inputs (<= kSingleBlockProcessNumThreshold elements) are handled by a
// single fused block writing the mean straight to `out`; larger inputs run a
// multi-block pass producing per-block partial means in `tmp_buffer`,
// followed by a single-block sum.
template<typename T>
class BinaryCrossEntropyWithLogitsMeanKernel final : public user_op::OpKernel,
                                                     public CudaGraphSupport {
 public:
  BinaryCrossEntropyWithLogitsMeanKernel() = default;
  ~BinaryCrossEntropyWithLogitsMeanKernel() override = default;
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
  std::shared_ptr<user_op::OpKernelCache> InitOpKernelCache(
      user_op::KernelCacheContext* ctx) const override {
    return CreateBCEWithLogitsReduceMeanKernelCache(ctx);
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache* cache) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    auto* out_blob = ctx->Tensor4ArgNameAndIndex("out", 0);
    int64_t local_elem_cnt = input_blob->shape_view().elem_cnt();
    int64_t reduce_elem_cnt = local_elem_cnt;
    if (cache != nullptr) {
      // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor.
      const auto* bce_cache = dynamic_cast<const BCEWithLogitsReduceMeanKernelCache*>(cache);
      CHECK_NOTNULL(bce_cache);
      reduce_elem_cnt = bce_cache->reduce_elem_cnt();
    }
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* out = out_blob->mut_dptr<T>();
    using ComputeType = typename DefaultComputeType<T>::type;
    if (local_elem_cnt <= kSingleBlockProcessNumThreshold) {
      // Single block: the fused kernel writes the final mean directly to out.
      FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, T, ComputeType>
          <<<1, kBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              input_blob->dptr<T>(), target_blob->dptr<T>(), out_blob->mut_dptr<T>(),
              local_elem_cnt, reduce_elem_cnt);
    } else {
      auto* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
      // Capacity of the byte buffer measured in elements of T.
      // NOTE(review): the buffer is written as ComputeType below, but both
      // this capacity and the registration's InferTmpSizeFn use sizeof(T) /
      // sizeof(dtype); for T == half (ComputeType == float) that looks
      // undersized — confirm against the allocator.
      const int64_t tmp_buffer_elem_cnt = tmp_buffer->shape_view().elem_cnt() / sizeof(T);
      const int64_t block_num = (local_elem_cnt + kBlockSize - 1) / kBlockSize;
      int launch_block = block_num;
      OF_CUDA_CHECK(GetNumBlocks(
          FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, ComputeType, ComputeType>,
          kBlockSize, 0, block_num, 32, &launch_block));
      launch_block = std::min<int32_t>(tmp_buffer_elem_cnt, launch_block);
      // Pass 1: per-block partial means into tmp_buffer.
      FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<T, ComputeType, ComputeType>
          <<<launch_block, kBlockSize, 0, ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              input_blob->dptr<T>(), target_blob->dptr<T>(), tmp_buffer->mut_dptr<ComputeType>(),
              local_elem_cnt, reduce_elem_cnt);
      // Pass 2: single-block sum of the partial means into out[0].
      // NOTE(review): `block_num` entries are summed but only `launch_block`
      // blocks wrote the buffer; when launch_block < block_num the tail reads
      // look uninitialized — verify intended.
      ReduceLocalSumKernel<T, ComputeType>
          <<<1, kReduceLocalSumBlockSize, 0,
             ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
              tmp_buffer->mut_dptr<ComputeType>(), out_blob->mut_dptr<T>(), block_num);
    }
  }
};
// Backward kernel for binary_cross_entropy_with_logits_reduce_mean:
//   dx = (sigmoid(input) - target) * dy / reduce_elem_cnt
// computed element-wise via cuda::elementwise::BinaryWithFactory so the
// scalar `dy` is dereferenced on the device (no host sync).
template<typename T>
class BinaryCrossEntropyWithLogitsReduceMeanGradKernel final : public user_op::OpKernel {
 public:
  BinaryCrossEntropyWithLogitsReduceMeanGradKernel() = default;
  // FIX: `override` added for consistency with
  // BinaryCrossEntropyWithLogitsMeanKernel's destructor above.
  ~BinaryCrossEntropyWithLogitsReduceMeanGradKernel() override = default;
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
  std::shared_ptr<user_op::OpKernelCache> InitOpKernelCache(
      user_op::KernelCacheContext* ctx) const override {
    return CreateBCEWithLogitsReduceMeanKernelCache(ctx);
  }

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx, user_op::OpKernelState* state,
               const user_op::OpKernelCache* cache) const override {
    const auto* input_blob = ctx->Tensor4ArgNameAndIndex("input", 0);
    const auto* target_blob = ctx->Tensor4ArgNameAndIndex("target", 0);
    const auto* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
    int64_t local_elem_cnt = input_blob->shape_view().elem_cnt();
    int64_t reduce_elem_cnt = local_elem_cnt;
    if (cache != nullptr) {
      // Because `out`'s SBP maybe P or B, we need to use reduce_elem_cnt as reduce_mean factor.
      const auto* bce_cache = dynamic_cast<const BCEWithLogitsReduceMeanKernelCache*>(cache);
      CHECK_NOTNULL(bce_cache);
      reduce_elem_cnt = bce_cache->reduce_elem_cnt();
    }
    const T* dy = dy_blob->dptr<T>();
    const T* input = input_blob->dptr<T>();
    const T* target = target_blob->dptr<T>();
    T* dx = dx_blob->mut_dptr<T>();
    using ComputeType = typename DefaultComputeType<T>::type;
    // NOTE(review): reduce_elem_cnt (int64_t) narrows to the functor's
    // int32_t elem_cnt parameter — confirm inputs never exceed INT32_MAX.
    OF_CUDA_CHECK((cuda::elementwise::BinaryWithFactory(
        BinaryCrossEntropyWithLogitsReduceMeanGradDyptrFunctor<T, ComputeType>(reduce_elem_cnt,
                                                                               dy),
        local_elem_cnt, dx, input, target,
        ctx->stream()->As<ep::CudaStream>()->cuda_stream())));
  }
};
}
// namespace
// Registers the forward kernel for `dtype` and sizes tmp_buffer to hold one
// per-block partial result for the multi-block path (see Compute).
// NOTE(review): the partials are written as ComputeType in Compute but sized
// with sizeof(dtype) here — for half (ComputeType float) this looks
// undersized; confirm.
#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(dtype) \
REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean") \
.SetCreateFn<BinaryCrossEntropyWithLogitsMeanKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
&& (user_op::HobDataType("input", 0) == GetDataType<dtype>::value) \
&& (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
&& (user_op::HobDataType("out", 0) == GetDataType<dtype>::value)) \
.SetInferTmpSizeFn([](user_op::InferContext* ctx) { \
const int64_t elem_cnt = ctx->InputShape("input", 0).elem_cnt(); \
const int64_t block_num = (elem_cnt + kBlockSize - 1) / kBlockSize; \
int launch_block = block_num; \
using ComputeType = typename DefaultComputeType<dtype>::type; \
OF_CUDA_CHECK(GetNumBlocks( \
FusedBinaryCrossEntropyWithLogitsReduceMeanKernel<dtype, ComputeType, ComputeType>, \
kBlockSize, 0, block_num, 32, &launch_block)); \
const int64_t tmp_buffer_size = GetCudaAlignedSize(launch_block * sizeof(dtype)); \
return tmp_buffer_size; \
});
// Registers the backward kernel for `dtype` (no temporary buffer required).
#define REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(dtype) \
REGISTER_USER_KERNEL("binary_cross_entropy_with_logits_reduce_mean_grad") \
.SetCreateFn<BinaryCrossEntropyWithLogitsReduceMeanGradKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
&& (user_op::HobDataType("input", 0) == GetDataType<dtype>::value) \
&& (user_op::HobDataType("target", 0) == GetDataType<dtype>::value) \
&& (user_op::HobDataType("dy", 0) == GetDataType<dtype>::value) \
&& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
// Instantiate forward and backward kernels for half, float and double.
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_KERNEL(double)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(half)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(float)
REGISTER_BINARY_CROSS_ENTROPY_REDUCE_MEAN_GRAD_KERNEL(double)
}  // namespace user_op
}  // namespace oneflow
\ No newline at end of file
oneflow/user/kernels/broadcast_pow_grad_kernel.hip.cpp
View file @
8f7de847
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/ndarray/ndarray_util.h"
#include "oneflow/core/ndarray/xpu_var_ndarray.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
{
// Element-wise out[i] = SafeLog(in[i]); `out` and `in` may alias (the caller
// applies it in place on tmp_buffer).
template<typename T>
__global__ void ComputeLogGpu(const int64_t len, T* out, const T* in) {
  CUDA_1D_KERNEL_LOOP(i, len) { out[i] = SafeLog(in[i]); }
}
// float16 specialization: reinterpret both buffers as device `half` and apply
// SafeLog element-wise.
template<>
__global__ void ComputeLogGpu<float16>(const int64_t len, float16* out, const float16* in) {
  const half* _in = reinterpret_cast<const half*>(in);
  half* _out = reinterpret_cast<half*>(out);
  CUDA_1D_KERNEL_LOOP(i, len) { _out[i] = SafeLog(_in[i]); }
}
// Backward of z = pow(x, y) with respect to the exponent y:
//   dy = reduce_sum(dz * z * log(x))  reduced over the axes broadcast from y.
// tmp_buffer (z-shaped) is reused serially: broadcast(x) -> log -> *dz -> *z,
// then ReduceSum collapses it into dy.
template<DeviceType device, typename T>
class BroadcastPowYGradKernel final : public user_op::OpKernel {
 public:
  BroadcastPowYGradKernel() = default;
  ~BroadcastPowYGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
    const user_op::Tensor* z_tensor = ctx->Tensor4ArgNameAndIndex("z", 0);
    const user_op::Tensor* dz_tensor = ctx->Tensor4ArgNameAndIndex("dz", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
    const int64_t num_axes = dz_tensor->shape_view().NumAxes();
    const int64_t elem_cnt = z_tensor->shape_view().elem_cnt();
    // Zero the scratch so the BroadcastAdd below yields exactly broadcast(x).
    Memset<device>(ctx->stream(), tmp_buffer->mut_dptr<T>(), 0,
                   GetCudaAlignedSize(elem_cnt * sizeof(T)));
    XpuVarNdarray<const T> z(z_tensor->shape_view(), z_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<const T> dz(dz_tensor->shape_view(), dz_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<const T> const_tmp(dz.shape(), tmp_buffer->dptr<T>());
    XpuVarNdarray<T> tmp(dz.shape(), tmp_buffer->mut_dptr<T>());
    XpuVarNdarray<const T> x(x_tensor->shape_view(), x_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<T> dy(dy_tensor->shape_view(), dy_tensor->mut_dptr<T>(), num_axes);
    // tmp = x + 0 (broadcasts x up to z's shape).
    NdarrayUtil<device, T>::BroadcastAdd(ctx->stream(), tmp, x, const_tmp);
    // tmp = SafeLog(tmp), element-wise in place.
    ComputeLogGpu<T><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                       ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, tmp_buffer->mut_dptr<T>(), tmp_buffer->dptr<T>());
    // tmp = dz * tmp, then tmp = z * tmp.
    NdarrayUtil<device, T>::BroadcastMul(ctx->stream(), tmp, dz, const_tmp);
    NdarrayUtil<device, T>::BroadcastMul(ctx->stream(), tmp, z, const_tmp);
    // dy = reduce_sum(tmp) over the broadcast axes; tmp doubles as scratch.
    NdarrayUtil<device, T>::ReduceSum(ctx->stream(), dy, const_tmp, tmp);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
}
// namespace
// Registers the y-grad kernel for (device, dtype) with a z-shaped scratch
// buffer (used serially inside Compute).
#define REGISTER_BROADCAST_POW_Y_GRAD_KERNEL(device, dtype_pair)                             \
  REGISTER_USER_KERNEL("broadcast_pow_y_grad")                                               \
      .SetCreateFn<BroadcastPowYGradKernel<device, OF_PP_PAIR_FIRST(dtype_pair)>>()          \
      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                  \
                       && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(dtype_pair)))   \
      .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) {                           \
        const user_op::TensorDesc& z = ctx->InputTensorDesc("z", 0);                         \
        const DataType& data_type = z.data_type();                                           \
        const int64_t elem_cnt = z.shape().elem_cnt();                                       \
        return GetCudaAlignedSize(elem_cnt * GetSizeOfDataType(data_type));                  \
      });
// Register the CUDA kernel for every arithmetic type plus float16.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_BROADCAST_POW_Y_GRAD_KERNEL, (DeviceType::kCUDA),
                                 ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ)
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "hip/hip_runtime.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/ndarray/ndarray_util.h"
#include "oneflow/core/ndarray/xpu_var_ndarray.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
{
// Element-wise out[i] = SafeLog(in[i]); `out` and `in` may alias (the caller
// applies it in place on tmp_buffer).
template<typename T>
__global__ void ComputeLogGpu(const int64_t len, T* out, const T* in) {
  CUDA_1D_KERNEL_LOOP(i, len) { out[i] = SafeLog(in[i]); }
}
// float16 specialization: reinterpret both buffers as device `half` and apply
// SafeLog element-wise.
template<>
__global__ void ComputeLogGpu<float16>(const int64_t len, float16* out, const float16* in) {
  const half* _in = reinterpret_cast<const half*>(in);
  half* _out = reinterpret_cast<half*>(out);
  CUDA_1D_KERNEL_LOOP(i, len) { _out[i] = SafeLog(_in[i]); }
}
// Backward of z = pow(x, y) with respect to the exponent y:
//   dy = reduce_sum(dz * z * log(x))  reduced over the axes broadcast from y.
// tmp_buffer (z-shaped) is reused serially: broadcast(x) -> log -> *dz -> *z,
// then ReduceSum collapses it into dy.
template<DeviceType device, typename T>
class BroadcastPowYGradKernel final : public user_op::OpKernel {
 public:
  BroadcastPowYGradKernel() = default;
  ~BroadcastPowYGradKernel() = default;

 private:
  using user_op::OpKernel::Compute;
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
    const user_op::Tensor* z_tensor = ctx->Tensor4ArgNameAndIndex("z", 0);
    const user_op::Tensor* dz_tensor = ctx->Tensor4ArgNameAndIndex("dz", 0);
    user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
    user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
    const int64_t num_axes = dz_tensor->shape_view().NumAxes();
    const int64_t elem_cnt = z_tensor->shape_view().elem_cnt();
    // Zero the scratch so the BroadcastAdd below yields exactly broadcast(x).
    Memset<device>(ctx->stream(), tmp_buffer->mut_dptr<T>(), 0,
                   GetCudaAlignedSize(elem_cnt * sizeof(T)));
    XpuVarNdarray<const T> z(z_tensor->shape_view(), z_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<const T> dz(dz_tensor->shape_view(), dz_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<const T> const_tmp(dz.shape(), tmp_buffer->dptr<T>());
    XpuVarNdarray<T> tmp(dz.shape(), tmp_buffer->mut_dptr<T>());
    XpuVarNdarray<const T> x(x_tensor->shape_view(), x_tensor->dptr<T>(), num_axes);
    XpuVarNdarray<T> dy(dy_tensor->shape_view(), dy_tensor->mut_dptr<T>(), num_axes);
    // tmp = x + 0 (broadcasts x up to z's shape).
    NdarrayUtil<device, T>::BroadcastAdd(ctx->stream(), tmp, x, const_tmp);
    // tmp = SafeLog(tmp), element-wise in place.
    ComputeLogGpu<T><<<BlocksNum4ThreadsNum(elem_cnt), kCudaThreadsNumPerBlock, 0,
                       ctx->stream()->As<ep::CudaStream>()->cuda_stream()>>>(
        elem_cnt, tmp_buffer->mut_dptr<T>(), tmp_buffer->dptr<T>());
    // tmp = dz * tmp, then tmp = z * tmp.
    NdarrayUtil<device, T>::BroadcastMul(ctx->stream(), tmp, dz, const_tmp);
    NdarrayUtil<device, T>::BroadcastMul(ctx->stream(), tmp, z, const_tmp);
    // dy = reduce_sum(tmp) over the broadcast axes; tmp doubles as scratch.
    NdarrayUtil<device, T>::ReduceSum(ctx->stream(), dy, const_tmp, tmp);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
}
// namespace
// Registers the y-grad kernel for (device, dtype) with a z-shaped scratch
// buffer (used serially inside Compute).
#define REGISTER_BROADCAST_POW_Y_GRAD_KERNEL(device, dtype_pair)                             \
  REGISTER_USER_KERNEL("broadcast_pow_y_grad")                                               \
      .SetCreateFn<BroadcastPowYGradKernel<device, OF_PP_PAIR_FIRST(dtype_pair)>>()          \
      .SetIsMatchedHob((user_op::HobDeviceType() == device)                                  \
                       && (user_op::HobDataType("x", 0) == OF_PP_PAIR_SECOND(dtype_pair)))   \
      .SetInferTmpSizeFn([](oneflow::user_op::InferContext* ctx) {                           \
        const user_op::TensorDesc& z = ctx->InputTensorDesc("z", 0);                         \
        const DataType& data_type = z.data_type();                                           \
        const int64_t elem_cnt = z.shape().elem_cnt();                                       \
        return GetCudaAlignedSize(elem_cnt * GetSizeOfDataType(data_type));                  \
      });
// Register the CUDA kernel for every arithmetic type plus float16.
OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_BROADCAST_POW_Y_GRAD_KERNEL, (DeviceType::kCUDA),
                                 ARITHMETIC_DATA_TYPE_SEQ FLOAT16_DATA_TYPE_SEQ)
}  // namespace oneflow
\ No newline at end of file
oneflow/user/kernels/categorical_ordinal_encode_kernel_util.hip.cpp
View file @
8f7de847
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <assert.h>
#include "oneflow/user/kernels/categorical_ordinal_encode_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
{
using
CuInt64T
=
unsigned
long
long
int
;
// int32 overload: forwards to the native device atomicCAS directly.
__device__ __inline__ int32_t AtomicCAS(int32_t* address, int32_t compare, int32_t val) {
  return atomicCAS(address, compare, val);
}
// int64 overload: the device atomicCAS only accepts unsigned long long, so
// round-trip through CuInt64T (same width, enforced by the static_assert).
__device__ __inline__ int64_t AtomicCAS(int64_t* address, int64_t compare, int64_t val) {
  static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error");
  return static_cast<int64_t>(atomicCAS(reinterpret_cast<CuInt64T*>(address),
                                        static_cast<CuInt64T>(compare),
                                        static_cast<CuInt64T>(val)));
}
// int32 overload: forwards to the native device atomicAdd directly.
__device__ __inline__ int32_t AtomicAdd(int32_t* address, int32_t val) {
  return atomicAdd(address, val);
}
// int64 overload: the device atomicAdd only accepts unsigned long long, so
// round-trip through CuInt64T (same width, enforced by the static_assert).
__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) {
  static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error");
  return static_cast<int64_t>(
      atomicAdd(reinterpret_cast<CuInt64T*>(address), static_cast<CuInt64T>(val)));
}
// Tries to resolve `hash` against one {key, value} table slot.
// Returns true when the slot is (now) owned by `hash`, writing the assigned
// ordinal to *out; returns false when the slot holds a different key, so the
// caller must probe the next slot.
// Key 0 means "empty" and value 0 means "not yet published"; hashes must
// therefore be non-zero here (GetOrInsertOne filters hash == 0).
template<typename K, typename V>
__device__ bool TryGetOrInsert(K* key, volatile V* value, V* size, const K hash, V* out) {
  // Attempt to claim an empty slot: CAS the key from 0 to hash.
  K old_key = AtomicCAS(key, static_cast<K>(0), hash);
  if (old_key == 0) {
    // Won the slot: allocate a new ordinal (starting at 1) and publish it
    // through the volatile value.
    V v = AtomicAdd(size, 1) + 1;
    *value = v;
    *out = v;
    return true;
  } else if (old_key == hash) {
    // Slot already holds our key; spin until the inserter's value write
    // becomes visible (value stays 0 until published).
    while (true) {
      V v = *value;
      if (v != 0) {
        *out = v;
        break;
      }
    }
    return true;
  } else {
    return false;
  }
}
// Looks up (or inserts) `hash` in an open-addressing table of `capacity`
// {key, value} pairs stored interleaved in `table`, linearly probing from
// hash % capacity.  hash == 0 is reserved and always maps to ordinal 0.
// Returns false only when the table is full and `hash` is absent.
template<typename T>
__device__ bool GetOrInsertOne(const size_t capacity, T* table, T* size, const T hash, T* out) {
  if (hash == 0) {
    *out = 0;
    return true;
  }
  const size_t start_idx = static_cast<size_t>(hash) % capacity;
  // fast path
  {
    T* key = table + start_idx * 2;
    T* value = key + 1;
    // Non-atomic read: only taken when the entry is already fully published.
    if (*key == hash && *value != 0) {
      *out = *value;
      return true;
    }
  }
  // Slow path: probe every slot at most once.
  for (size_t count = 0; count < capacity; ++count) {
    const size_t idx = (start_idx + count) % capacity;
    T* key = table + idx * 2;
    T* value = key + 1;
    if (TryGetOrInsert<T, T>(key, value, size, hash, out)) { return true; }
  }
  return false;
}
// Encodes hash[i] -> out[i] for i in [0, n) using the shared device table.
// Table overflow is a hard error: NDEBUG is force-undefined at the top of
// this file, so the assert fires even in release builds.
template<typename T>
__global__ void EncodeGpu(const size_t capacity, T* table, T* size, const int64_t n,
                          const T* hash, T* out) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    bool success = GetOrInsertOne<T>(capacity, table, size, hash[i], out + i);
    assert(success);
  }
}
}
// namespace
// CUDA specialization: launches EncodeGpu to map each of the n hashes to its
// ordinal, inserting previously unseen hashes into the device-resident
// open-addressing `table` (capacity slots of interleaved {key, value}).
template<typename T>
struct CategoricalOrdinalEncodeKernelUtil<DeviceType::kCUDA, T> {
  static void Encode(ep::Stream* stream, int64_t capacity, T* table, T* size, int64_t n,
                     const T* hash, T* out) {
    EncodeGpu<T><<<BlocksNum4ThreadsNum(n), kCudaThreadsNumPerBlock, 0,
                   stream->As<ep::CudaStream>()->cuda_stream()>>>(capacity, table, size, n, hash,
                                                                  out);
  }
};
// Explicitly instantiate the CUDA encoder for every index data type.
#define INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA(type_cpp, type_proto) \
  template struct CategoricalOrdinalEncodeKernelUtil<DeviceType::kCUDA, type_cpp>;
OF_PP_FOR_EACH_TUPLE(INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ);
#undef INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA
#include "hip/hip_runtime.h"
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <assert.h>
#include "oneflow/user/kernels/categorical_ordinal_encode_kernel_util.h"
#include "oneflow/core/kernel/kernel_util.hip.h"
#include "oneflow/core/ep/rocm/cuda_stream.h"
namespace
oneflow
{
namespace
{
using
CuInt64T
=
unsigned
long
long
int
;
// int32 overload: forwards to the native device atomicCAS directly.
__device__ __inline__ int32_t AtomicCAS(int32_t* address, int32_t compare, int32_t val) {
  return atomicCAS(address, compare, val);
}
// int64 overload: the device atomicCAS only accepts unsigned long long, so
// round-trip through CuInt64T (same width, enforced by the static_assert).
__device__ __inline__ int64_t AtomicCAS(int64_t* address, int64_t compare, int64_t val) {
  static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error");
  return static_cast<int64_t>(atomicCAS(reinterpret_cast<CuInt64T*>(address),
                                        static_cast<CuInt64T>(compare),
                                        static_cast<CuInt64T>(val)));
}
// int32 overload: forwards to the native device atomicAdd directly.
__device__ __inline__ int32_t AtomicAdd(int32_t* address, int32_t val) {
  return atomicAdd(address, val);
}
// int64 overload: the device atomicAdd only accepts unsigned long long, so
// round-trip through CuInt64T (same width, enforced by the static_assert).
__device__ __inline__ int64_t AtomicAdd(int64_t* address, int64_t val) {
  static_assert(sizeof(int64_t) == sizeof(CuInt64T), "size error");
  return static_cast<int64_t>(
      atomicAdd(reinterpret_cast<CuInt64T*>(address), static_cast<CuInt64T>(val)));
}
// Tries to resolve `hash` against one {key, value} table slot.
// Returns true when the slot is (now) owned by `hash`, writing the assigned
// ordinal to *out; returns false when the slot holds a different key, so the
// caller must probe the next slot.
// Key 0 means "empty" and value 0 means "not yet published"; hashes must
// therefore be non-zero here (GetOrInsertOne filters hash == 0).
template<typename K, typename V>
__device__ bool TryGetOrInsert(K* key, volatile V* value, V* size, const K hash, V* out) {
  // Attempt to claim an empty slot: CAS the key from 0 to hash.
  K old_key = AtomicCAS(key, static_cast<K>(0), hash);
  if (old_key == 0) {
    // Won the slot: allocate a new ordinal (starting at 1) and publish it
    // through the volatile value.
    V v = AtomicAdd(size, 1) + 1;
    *value = v;
    *out = v;
    return true;
  } else if (old_key == hash) {
    // Slot already holds our key; spin until the inserter's value write
    // becomes visible (value stays 0 until published).
    while (true) {
      V v = *value;
      if (v != 0) {
        *out = v;
        break;
      }
    }
    return true;
  } else {
    return false;
  }
}
// Looks up (or inserts) `hash` in an open-addressing table of `capacity`
// {key, value} pairs stored interleaved in `table`, linearly probing from
// hash % capacity.  hash == 0 is reserved and always maps to ordinal 0.
// Returns false only when the table is full and `hash` is absent.
template<typename T>
__device__ bool GetOrInsertOne(const size_t capacity, T* table, T* size, const T hash, T* out) {
  if (hash == 0) {
    *out = 0;
    return true;
  }
  const size_t start_idx = static_cast<size_t>(hash) % capacity;
  // fast path
  {
    T* key = table + start_idx * 2;
    T* value = key + 1;
    // Non-atomic read: only taken when the entry is already fully published.
    if (*key == hash && *value != 0) {
      *out = *value;
      return true;
    }
  }
  // Slow path: probe every slot at most once.
  for (size_t count = 0; count < capacity; ++count) {
    const size_t idx = (start_idx + count) % capacity;
    T* key = table + idx * 2;
    T* value = key + 1;
    if (TryGetOrInsert<T, T>(key, value, size, hash, out)) { return true; }
  }
  return false;
}
// Encodes hash[i] -> out[i] for i in [0, n) using the shared device table.
// Table overflow is a hard error: NDEBUG is force-undefined at the top of
// this file, so the assert fires even in release builds.
template<typename T>
__global__ void EncodeGpu(const size_t capacity, T* table, T* size, const int64_t n,
                          const T* hash, T* out) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    bool success = GetOrInsertOne<T>(capacity, table, size, hash[i], out + i);
    assert(success);
  }
}
}
// namespace
// CUDA specialization: launches EncodeGpu to map each of the n hashes to its
// ordinal, inserting previously unseen hashes into the device-resident
// open-addressing `table` (capacity slots of interleaved {key, value}).
template<typename T>
struct CategoricalOrdinalEncodeKernelUtil<DeviceType::kCUDA, T> {
  static void Encode(ep::Stream* stream, int64_t capacity, T* table, T* size, int64_t n,
                     const T* hash, T* out) {
    EncodeGpu<T><<<BlocksNum4ThreadsNum(n), kCudaThreadsNumPerBlock, 0,
                   stream->As<ep::CudaStream>()->cuda_stream()>>>(capacity, table, size, n, hash,
                                                                  out);
  }
};
// Explicitly instantiate the CUDA encoder for every index data type.
#define INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA(type_cpp, type_proto) \
  template struct CategoricalOrdinalEncodeKernelUtil<DeviceType::kCUDA, type_cpp>;
OF_PP_FOR_EACH_TUPLE(INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA, INDEX_DATA_TYPE_SEQ);
#undef INSTANTIATE_CATEGORICAL_ORDINAL_ENCODE_KERNEL_UTIL_CUDA
}  // namespace oneflow
\ No newline at end of file
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment