gaoqiong / composable_kernel · Commits · 95ad9091

Commit 95ad9091, authored Jul 10, 2022 by Wenkai

    Merge branch 'develop' into static_ck_small_gemm

Parents: d2640676, 1677cf70

Changes: 44 files in the merge (diff shown across 3 pages). Showing 20 changed files with 1609 additions and 499 deletions (+1609, -499).
Changed files shown on this page:

- include/ck/tensor_operation/gpu/device/device_cgemm.hpp (+73, -0)
- include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp (+974, -0)
- include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp (+8, -15)
- include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp (+2, -2)
- include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp (+98, -110)
- include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp (+3, -3)
- include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp (+103, -1)
- include/ck/tensor_operation/gpu/element/element_wise_operation.hpp (+10, -11)
- include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp (+8, -8)
- include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp (+6, -6)
- include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp (+2, -2)
- include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp (+1, -1)
- include/ck/utility/math_v2.hpp (+60, -10)
- include/ck/utility/reduction_functions_accumulate.hpp (+14, -21)
- include/ck/utility/reduction_operator.hpp (+7, -10)
- library/include/ck/library/host_tensor/host_reduce_util.hpp (+0, -257)
- library/include/ck/library/host_tensor/host_reduction.hpp (+33, -38)
- library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp (+203, -0)
- profiler/include/profile_batched_gemm_reduce_impl.hpp (+2, -2)
- profiler/include/profile_gemm_reduce_impl.hpp (+2, -2)
include/ck/tensor_operation/gpu/device/device_cgemm.hpp (new file, mode 100644, +73, -0)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2022 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#pragma once
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {

template <typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
struct DeviceCGemm : public BaseOperator
{
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_a_real, const void* p_a_imag,
                        const void* p_b_real, const void* p_b_imag,
                        void* p_c_real, void* p_c_imag, void* p_workspace,
                        ck::index_t M, ck::index_t N, ck::index_t K,
                        ck::index_t StrideA, ck::index_t StrideB, ck::index_t StrideC,
                        AElementwiseOperation a_element_op,
                        BElementwiseOperation b_element_op,
                        CElementwiseOperation c_element_op,
                        ck::index_t KBatch = 1) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;

    virtual std::size_t GetWorkspaceSize(index_t MRaw, index_t NRaw, index_t KRaw,
                                         index_t StrideA, index_t StrideB, index_t StrideC) = 0;
};

template <typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
using DeviceCGemmPtr = std::unique_ptr<
    DeviceCGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;

} // namespace device
} // namespace tensor_operation
} // namespace ck
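For orientation, a minimal host-side sketch of how this interface is meant to be driven. The concrete op sitting behind the DeviceCGemmPtr (for example an instance built from device_cgemm_4gemm_xdl_cshuffle.hpp below) and the allocation of the device buffers are assumptions; only the MakeArgumentPointer / MakeInvokerPointer / GetWorkspaceSize calls come from this header.

// Sketch only: assumes `cgemm` already points at a concrete DeviceCGemm implementation
// and that all device buffers, including the workspace, were allocated by the caller.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using CGemmPtr =
    ck::tensor_operation::device::DeviceCGemmPtr<PassThrough, PassThrough, PassThrough>;

float run_cgemm(CGemmPtr& cgemm,
                const void* a_real, const void* a_imag,
                const void* b_real, const void* b_imag,
                void* c_real, void* c_imag, void* p_workspace,
                ck::index_t M, ck::index_t N, ck::index_t K,
                ck::index_t StrideA, ck::index_t StrideB, ck::index_t StrideC)
{
    // The workspace must hold at least GetWorkspaceSize(M, N, K, StrideA, StrideB, StrideC) bytes.
    auto argument = cgemm->MakeArgumentPointer(a_real, a_imag, b_real, b_imag, c_real, c_imag,
                                               p_workspace, M, N, K, StrideA, StrideB, StrideC,
                                               PassThrough{}, PassThrough{}, PassThrough{});
    auto invoker  = cgemm->MakeInvokerPointer();

    // BaseInvoker::Run executes the op; the float return is the timing reported by the invoker.
    return invoker->Run(argument.get());
}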
include/ck/tensor_operation/gpu/device/device_cgemm_4gemm_xdl_cshuffle.hpp (new file, mode 100644, +974, -0): this diff is collapsed on the page and its contents are not shown.
include/ck/tensor_operation/gpu/device/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp (+8, -15)

@@ -460,6 +460,8 @@ struct
     using C0GridDesc_M_N = remove_cvref_t<decltype(GridDescs{}[I3])>;
     using C1GridDesc_M_N = remove_cvref_t<decltype(GridDescs{}[I4])>;

+    using Block2CTileMap = BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, CGridDesc_M_N>;
+
     // GridwiseGemm
     using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3<
         BlockSize,
@@ -522,8 +524,6 @@ struct
                  std::vector<ck::index_t> conv_filter_dilations,
                  std::vector<ck::index_t> input_left_pads,
                  std::vector<ck::index_t> input_right_pads,
-                 ck::index_t M01,
-                 ck::index_t N01,
                  InElementwiseOperation in_element_op,
                  WeiElementwiseOperation wei_element_op,
                  OutElementwiseOperation out_element_op)
@@ -540,10 +540,7 @@ struct
               c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{},
               c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{},
               c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{},
-              block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01)},
-              M01_{M01},
-              N01_{N01},
+              block_2_ctile_map_{},
               in_element_op_{in_element_op},
               wei_element_op_{wei_element_op},
               out_element_op_{out_element_op},
@@ -576,6 +573,8 @@ struct
                 c0_grid_desc_m_n_ = descs[I3];
                 c1_grid_desc_m_n_ = descs[I4];

+                block_2_ctile_map_ = Block2CTileMap{c_grid_desc_m_n_};
+
                 if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_,
                                                b_grid_desc_k0_n_k1_,
                                                c_grid_desc_m_n_,
@@ -618,9 +617,7 @@ struct
         typename GridwiseGemm::C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
             c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_;
-        typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_;
-        index_t M01_;
-        index_t N01_;
+        Block2CTileMap block_2_ctile_map_;
         InElementwiseOperation in_element_op_;
         WeiElementwiseOperation wei_element_op_;
         OutElementwiseOperation out_element_op_;
@@ -723,7 +720,7 @@ struct
                     InElementwiseOperation,
                     WeiElementwiseOperation,
                     OutElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
+                    Block2CTileMap,
                     true>;

                 ave_time = launch_and_time_kernel(
@@ -767,7 +764,7 @@ struct
                     InElementwiseOperation,
                     WeiElementwiseOperation,
                     OutElementwiseOperation,
-                    remove_reference_t<typename GridwiseGemm::DefaultBlock2CTileMap>,
+                    Block2CTileMap,
                     false>;

                 ave_time = launch_and_time_kernel(
@@ -894,8 +891,6 @@ struct
                         conv_filter_dilations,
                         input_left_pads,
                         input_right_pads,
-                        1,
-                        1,
                         in_element_op,
                         wei_element_op,
                         out_element_op};
@@ -938,8 +933,6 @@ struct
                         conv_filter_dilations,
                         input_left_pads,
                         input_right_pads,
-                        1,
-                        1,
                         in_element_op,
                         wei_element_op,
                         out_element_op);
include/ck/tensor_operation/gpu/device/device_gemm_dl.hpp (+2, -2)

@@ -60,8 +60,8 @@ template <
           index_t CThreadTransferDstScalarPerVector,
           enable_if_t<
               is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
-                  is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
-                  is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
+                  is_same_v<BElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
+                  is_same_v<CElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
               bool> = false>
 struct DeviceGemmDl
     : public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp (+98, -110)

@@ -24,57 +24,33 @@ template <typename GridwiseGemm,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
-          bool HasMainKBlockLoop,
-          index_t MaxGroupCount>
+          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        kernel_grouped_gemm_xdlops_v2r3(const StaticallyIndexedArray<GemmDesc, MaxGroupCount> gemm_descs,
-                                        const index_t group_count,
-                                        const AElementwiseOperation a_element_op,
-                                        const BElementwiseOperation b_element_op,
-                                        const CElementwiseOperation c_element_op)
+        kernel_grouped_gemm_xdlops_v2r3(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
+                                        const index_t group_count,
+                                        const AElementwiseOperation a_element_op,
+                                        const BElementwiseOperation b_element_op,
+                                        const CElementwiseOperation c_element_op)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
     __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

     const index_t block_id = get_block_1d_id();

-#if 1
-    static_for<0, MaxGroupCount, 1>{}([&](auto i) {
-        if(block_id >= gemm_descs[i].BlockStart_ && block_id < gemm_descs[i].BlockEnd_ &&
-           i < group_count)
-        {
-            auto group_id = i;
-
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
-                gemm_descs[group_id].a_ptr,
-                gemm_descs[group_id].b_ptr,
-                gemm_descs[group_id].c_ptr,
-                p_shared,
-                gemm_descs[group_id].a_grid_desc_k0_m_k1_,
-                gemm_descs[group_id].b_grid_desc_k0_n_k1_,
-                gemm_descs[group_id].c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
-                a_element_op,
-                b_element_op,
-                c_element_op,
-                gemm_descs[group_id].grouped_gemm_block_2_ctile_map_);
-        }
-    });
-#else
-    const auto gemm_desc_ptr = reinterpret_cast<const GemmDesc*>(&gemm_descs);
-
-    index_t group_id = 0;
-    static_for<0, MaxGroupCount, 1>{}([&](auto i) {
-        group_id = (block_id >= gemm_descs[i].BlockStart && block_id < gemm_descs[i].BlockEnd &&
-                    i < group_count)
-                       ? i
-                       : group_id;
-    });
-
-    const index_t block_id_grp = block_id - gemm_desc_ptr[group_id].BlockStart;
+    const auto gemm_desc_ptr = reinterpret_cast<const GemmDesc*>(
+        cast_pointer_to_generic_address_space(gemm_descs_const));
+
+    index_t group_id = 0;
+
+    for(index_t i = 0; i < group_count; i++)
+    {
+        group_id = (block_id >= gemm_desc_ptr[i].BlockStart_ && block_id < gemm_desc_ptr[i].BlockEnd_)
+                       ? i
+                       : group_id;
+    }

     GridwiseGemm::template Run<HasMainKBlockLoop>(gemm_desc_ptr[group_id].a_ptr,
@@ -87,11 +63,9 @@ __global__ void
                                                   a_element_op,
                                                   b_element_op,
                                                   c_element_op,
-                                                  gemm_desc_ptr[group_id].block_2_ctile_map_,
-                                                  block_id_grp);
-#endif
+                                                  gemm_desc_ptr[group_id].grouped_gemm_block_2_ctile_map_);
 #else
-    ignore = gemm_descs;
+    ignore = gemm_descs_const;
     ignore = group_count;
     ignore = a_element_op;
     ignore = b_element_op;
@@ -388,6 +362,8 @@ struct DeviceGroupedGemmXdl
         {
             grid_size_ = 0;

+            gemm_descs_args_workspace_ = nullptr;
+
             group_count_ = ck::type_convert<ck::index_t>(gemm_shapes.size());

             if(!(group_count_ == ck::type_convert<ck::index_t>(p_a.size()) &&
@@ -461,6 +437,8 @@ struct DeviceGroupedGemmXdl
         std::vector<GemmDescKernelArg> gemm_desc_kernel_arg_;

+        void* gemm_descs_args_workspace_;
+
         index_t grid_size_;
     };
@@ -471,49 +449,49 @@ struct DeviceGroupedGemmXdl
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            StaticallyIndexedArray<GemmDescKernelArg, MaxGroupCount> gemm_desc_kernel_args;
-
             bool has_main_k_block_loop = true;

-            static_for<0, MaxGroupCount, 1>{}([&](auto i) {
-                if(i < arg.gemm_desc_kernel_arg_.size())
-                {
-                    gemm_desc_kernel_args(i) = arg.gemm_desc_kernel_arg_[i];
-
-                    std::cout << "group: " << i << " arg.a_grid_desc_k0_m_k1_{"
-                              << gemm_desc_kernel_args[i].a_grid_desc_k0_m_k1_.GetLength(I0) << ", "
-                              << gemm_desc_kernel_args[i].a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
-                              << gemm_desc_kernel_args[i].a_grid_desc_k0_m_k1_.GetLength(I2) << "}";
-                    std::cout << ", arg.b_grid_desc_k0_n_k1_{"
-                              << gemm_desc_kernel_args[i].b_grid_desc_k0_n_k1_.GetLength(I0) << ", "
-                              << gemm_desc_kernel_args[i].b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
-                              << gemm_desc_kernel_args[i].b_grid_desc_k0_n_k1_.GetLength(I2) << "}";
-                    std::cout << ", arg.c_grid_desc_m_n_{ "
-                              << gemm_desc_kernel_args[i].c_grid_desc_m_n_.GetLength(I0) << ", "
-                              << gemm_desc_kernel_args[i].c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
-
-                    if(!GridwiseGemm::CheckValidity(gemm_desc_kernel_args[i].a_grid_desc_k0_m_k1_,
-                                                    gemm_desc_kernel_args[i].b_grid_desc_k0_n_k1_,
-                                                    gemm_desc_kernel_args[i].c_grid_desc_m_n_,
-                                                    gemm_desc_kernel_args[i].grouped_gemm_block_2_ctile_map_))
-                    {
-                        throw std::runtime_error(
-                            "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
-                    }
-
-                    const auto K = gemm_desc_kernel_args[i].a_grid_desc_k0_m_k1_.GetLength(I0) *
-                                   gemm_desc_kernel_args[i].a_grid_desc_k0_m_k1_.GetLength(I2);
-
-                    if(GridwiseGemm::CalculateHasMainKBlockLoop(K) != has_main_k_block_loop)
-                    {
-                        throw std::runtime_error("wrong! not all gemm has_main_k_block_loop");
-                    }
-                }
-            });
+            for(std::size_t i = 0; i < arg.gemm_desc_kernel_arg_.size(); i++)
+            {
+                std::cout << "group: " << i << " arg.a_grid_desc_k0_m_k1_{"
+                          << arg.gemm_desc_kernel_arg_[i].a_grid_desc_k0_m_k1_.GetLength(I0) << ", "
+                          << arg.gemm_desc_kernel_arg_[i].a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
+                          << arg.gemm_desc_kernel_arg_[i].a_grid_desc_k0_m_k1_.GetLength(I2) << "}";
+                std::cout << ", arg.b_grid_desc_k0_n_k1_{"
+                          << arg.gemm_desc_kernel_arg_[i].b_grid_desc_k0_n_k1_.GetLength(I0) << ", "
+                          << arg.gemm_desc_kernel_arg_[i].b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
+                          << arg.gemm_desc_kernel_arg_[i].b_grid_desc_k0_n_k1_.GetLength(I2) << "}";
+                std::cout << ", arg.c_grid_desc_m_n_{ "
+                          << arg.gemm_desc_kernel_arg_[i].c_grid_desc_m_n_.GetLength(I0) << ", "
+                          << arg.gemm_desc_kernel_arg_[i].c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
+
+                if(!GridwiseGemm::CheckValidity(arg.gemm_desc_kernel_arg_[i].a_grid_desc_k0_m_k1_,
+                                                arg.gemm_desc_kernel_arg_[i].b_grid_desc_k0_n_k1_,
+                                                arg.gemm_desc_kernel_arg_[i].c_grid_desc_m_n_,
+                                                arg.gemm_desc_kernel_arg_[i].grouped_gemm_block_2_ctile_map_))
+                {
+                    throw std::runtime_error(
+                        "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting");
+                }
+
+                const auto K = arg.gemm_desc_kernel_arg_[i].a_grid_desc_k0_m_k1_.GetLength(I0) *
+                               arg.gemm_desc_kernel_arg_[i].a_grid_desc_k0_m_k1_.GetLength(I2);
+
+                if(GridwiseGemm::CalculateHasMainKBlockLoop(K) != has_main_k_block_loop)
+                {
+                    throw std::runtime_error("wrong! not all gemm has_main_k_block_loop");
+                }
+            }
+
+            hipGetErrorString(hipMemcpy(arg.gemm_descs_args_workspace_,
+                                        arg.gemm_desc_kernel_arg_.data(),
+                                        arg.gemm_desc_kernel_arg_.size() * sizeof(GemmDescKernelArg),
+                                        hipMemcpyHostToDevice));

             float ave_time = 0;
@@ -523,23 +501,23 @@ struct DeviceGroupedGemmXdl
                 const auto kernel = kernel_grouped_gemm_xdlops_v2r3<GridwiseGemm,
                                                                     ADataType, // TODO: distiguish A/B datatype
                                                                     CDataType,
-                                                                    remove_reference_t<GemmDescKernelArg>,
+                                                                    GemmDescKernelArg,
                                                                     AElementwiseOperation,
                                                                     BElementwiseOperation,
                                                                     CElementwiseOperation,
-                                                                    true,
-                                                                    MaxGroupCount>;
+                                                                    true>;

-                ave_time = launch_and_time_kernel(stream_config,
-                                                  kernel,
-                                                  dim3(arg.grid_size_),
-                                                  dim3(BlockSize),
-                                                  0,
-                                                  gemm_desc_kernel_args,
-                                                  arg.gemm_desc_kernel_arg_.size(),
-                                                  arg.a_element_op_,
-                                                  arg.b_element_op_,
-                                                  arg.c_element_op_);
+                ave_time = launch_and_time_kernel(
+                    stream_config,
+                    kernel,
+                    dim3(arg.grid_size_),
+                    dim3(BlockSize),
+                    0,
+                    cast_pointer_to_constant_address_space(arg.gemm_descs_args_workspace_),
+                    arg.gemm_desc_kernel_arg_.size(),
+                    arg.a_element_op_,
+                    arg.b_element_op_,
+                    arg.c_element_op_);
             }
             else
             {
@@ -547,23 +525,23 @@ struct DeviceGroupedGemmXdl
                 const auto kernel = kernel_grouped_gemm_xdlops_v2r3<GridwiseGemm,
                                                                     ADataType, // TODO: distiguish A/B datatype
                                                                     CDataType,
-                                                                    remove_reference_t<GemmDescKernelArg>,
+                                                                    GemmDescKernelArg,
                                                                     AElementwiseOperation,
                                                                     BElementwiseOperation,
                                                                     CElementwiseOperation,
-                                                                    false,
-                                                                    MaxGroupCount>;
+                                                                    false>;

-                ave_time = launch_and_time_kernel(stream_config,
-                                                  kernel,
-                                                  dim3(arg.grid_size_),
-                                                  dim3(BlockSize),
-                                                  0,
-                                                  gemm_desc_kernel_args,
-                                                  arg.gemm_desc_kernel_arg_.size(),
-                                                  arg.a_element_op_,
-                                                  arg.b_element_op_,
-                                                  arg.c_element_op_);
+                ave_time = launch_and_time_kernel(
+                    stream_config,
+                    kernel,
+                    dim3(arg.grid_size_),
+                    dim3(BlockSize),
+                    0,
+                    cast_pointer_to_constant_address_space(arg.gemm_descs_args_workspace_),
+                    arg.gemm_desc_kernel_arg_.size(),
+                    arg.a_element_op_,
+                    arg.b_element_op_,
+                    arg.c_element_op_);
             }

             return ave_time;
@@ -652,6 +630,16 @@ struct DeviceGroupedGemmXdl
         return str.str();
     }

+    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
+    {
+        return dynamic_cast<const Argument*>(p_arg)->group_count_ * sizeof(GemmDescKernelArg);
+    }
+
+    void SetWorkSpacePointer(BaseArgument* p_arg, void* workspace_ptr) const override
+    {
+        dynamic_cast<Argument*>(p_arg)->gemm_descs_args_workspace_ = workspace_ptr;
+    }
 };

 } // namespace device
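The two new overrides above are the host-side half of the change: the per-group GemmDescKernelArg array is now serialized into a device workspace and read by the kernel through CK_CONSTANT_ADDRESS_SPACE, instead of being passed by value as a StaticallyIndexedArray. A hedged sketch of the intended call sequence follows; the hipMalloc bookkeeping and error handling are assumptions, while GetWorkSpaceSize and SetWorkSpacePointer come from this diff.

#include <hip/hip_runtime.h>

// Sketch: `op` is the DeviceGroupedGemmXdl instance (seen through its base class) and
// `p_arg` is the BaseArgument it created; HIP error handling is omitted for brevity.
void attach_grouped_gemm_workspace(ck::tensor_operation::device::BaseOperator& op,
                                   ck::tensor_operation::device::BaseArgument* p_arg)
{
    // Per the override above this is group_count_ * sizeof(GemmDescKernelArg).
    const auto workspace_bytes = op.GetWorkSpaceSize(p_arg);

    void* p_workspace = nullptr;
    hipMalloc(&p_workspace, workspace_bytes);

    // Invoker::Run later hipMemcpy's the host-side kernel args into this buffer and hands it
    // to the kernel via cast_pointer_to_constant_address_space.
    op.SetWorkSpacePointer(p_arg, p_workspace);
}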
include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp (+3, -3)

@@ -348,8 +348,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
             if constexpr(use_multiblock)
             {
-                const auto zeroVal =
-                    ck::reduce::GetReductionZeroValueForInMemoryDataOperation<OutDataType>(
+                const auto identityVal =
+                    ck::reduce::GetIdentityValueueForInMemoryDataOperation<OutDataType>(
                         OutMemoryDataOperation);

                 const auto kernel_pre =
@@ -362,7 +362,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
                         0,
                         out_grid_desc_m_2,
                         arg.out_dev_,
-                        zeroVal);
+                        identityVal);
                 };

                 avg_time += launch_and_time_kernel(stream_config,
include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp (+103, -1)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2022 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#pragma once
#include "data_type.hpp"
@@ -5,14 +30,22 @@ namespace ck {
 namespace tensor_operation {
 namespace binary_element_wise {

-struct Add
+template <typename Y, typename X1, typename X2>
+struct Add;
+
+template <>
+struct Add<double, double, double>
+{
+    __host__ __device__ constexpr void
+    operator()(double& dst, const double& src1, const double& src2) const { dst = src1 + src2; }
+};
+
+template <>
+struct Add<float, float, float>
 {
     __host__ __device__ constexpr void
     operator()(float& dst, const float& src1, const float& src2) const
     {
@@ -20,6 +53,75 @@ struct Add
     }
 };

+template <>
+struct Add<half_t, half_t, half_t>
+{
+    __host__ __device__ constexpr void
+    operator()(half_t& dst, const half_t& src1, const half_t& src2) const { dst = src1 + src2; }
+};
+
+template <>
+struct Add<bhalf_t, bhalf_t, bhalf_t>
+{
+    __host__ __device__ constexpr void
+    operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const
+    {
+        const float x1 = ck::type_convert<float>(src1);
+        const float x2 = ck::type_convert<float>(src2);
+        const float y  = x1 + x2;
+
+        dst = ck::type_convert<bhalf_t>(y);
+    }
+};
+
+template <typename Y, typename X1, typename X2>
+struct Substract;
+
+template <>
+struct Substract<double, double, double>
+{
+    __host__ __device__ constexpr void
+    operator()(double& dst, const double& src1, const double& src2) const { dst = src1 - src2; }
+};
+
+template <>
+struct Substract<float, float, float>
+{
+    __host__ __device__ constexpr void
+    operator()(float& dst, const float& src1, const float& src2) const { dst = src1 - src2; }
+};
+
+template <>
+struct Substract<half_t, half_t, half_t>
+{
+    __host__ __device__ constexpr void
+    operator()(half_t& dst, const half_t& src1, const half_t& src2) const { dst = src1 - src2; }
+};
+
+template <>
+struct Substract<bhalf_t, bhalf_t, bhalf_t>
+{
+    __host__ __device__ constexpr void
+    operator()(bhalf_t& dst, const bhalf_t& src1, const bhalf_t& src2) const
+    {
+        const float x1 = ck::type_convert<float>(src1);
+        const float x2 = ck::type_convert<float>(src2);
+        const float y  = x1 - x2;
+
+        dst = ck::type_convert<bhalf_t>(y);
+    }
+};
+
 } // namespace binary_element_wise
 } // namespace tensor_operation
 } // namespace ck
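A short host-side sketch of how the new type-parameterized functors are selected and applied. Only the specializations shown in this diff are used; the wrapper function itself is illustrative.

// Sketch: pick the (Y, X1, X2) specialization explicitly and apply it element-wise.
inline void binary_elementwise_example()
{
    using ck::tensor_operation::binary_element_wise::Add;
    using ck::tensor_operation::binary_element_wise::Substract;

    float sum = 0.f;
    Add<float, float, float>{}(sum, 1.5f, 2.5f); // sum = 4.0f

    double diff = 0.0;
    Substract<double, double, double>{}(diff, 5.0, 3.0); // diff = 2.0
}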
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp (+10, -11)

 #pragma once
 #include "data_type.hpp"
+#include "math_v2.hpp"

 namespace ck {
 namespace tensor_operation {

@@ -296,7 +297,7 @@ struct UnaryAbs<float, float>
 {
     __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };

-    __host__ __device__ void operator()(float& y, const float& x) const { y = abs(x); };
+    __host__ __device__ void operator()(float& y, const float& x) const { y = ck::math::abs(x); };
 };

 template <>
@@ -304,7 +305,7 @@ struct UnaryAbs<half_t, half_t>
 {
     __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };

-    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = __habs(x); };
+    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = ck::math::abs(x); };
 };

 template <>
@@ -312,7 +313,7 @@ struct UnaryAbs<double, double>
 {
     __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };

-    __host__ __device__ void operator()(double& y, const double& x) const { y = abs(x); };
+    __host__ __device__ void operator()(double& y, const double& x) const { y = ck::math::abs(x); };
 };

 template <>
@@ -320,12 +321,7 @@ struct UnaryAbs<int8_t, int8_t>
 {
     __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };

-    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const
-    {
-        int8_t sgn = x >> (8 - 1);
-
-        y = (x ^ sgn) - sgn;
-    };
+    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = ck::math::abs(x); };
 };

 template <typename Y, typename X>
@@ -336,7 +332,7 @@ struct UnarySqrt<float, float>
 {
     __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };

-    __host__ __device__ void operator()(float& y, const float& x) const { y = sqrtf(x); };
+    __host__ __device__ void operator()(float& y, const float& x) const { y = ck::math::sqrt(x); };
 };

 template <>
@@ -344,7 +340,10 @@ struct UnarySqrt<double, double>
 {
     __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };

-    __host__ __device__ void operator()(double& y, const double& x) const { y = sqrt(x); };
+    __host__ __device__ void operator()(double& y, const double& x) const { y = ck::math::sqrt(x); };
 };

 } // namespace element_wise
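A small host-side smoke test of the functors touched above; after this change the same operator() body compiles for host and device because it routes through the ck::math overload set added in math_v2.hpp below. The test function itself is illustrative.

inline void unary_ops_smoke_test()
{
    ck::tensor_operation::element_wise::UnaryAbs<float, float> abs_op;
    ck::tensor_operation::element_wise::UnarySqrt<float, float> sqrt_op;

    float y = 0.f;
    abs_op(y, -3.5f); // y == 3.5f via ck::math::abs
    sqrt_op(y, 16.f); // y == 4.0f via ck::math::sqrt
}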
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp (+8, -8)

@@ -171,7 +171,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                                AccDataType beta,
                                OutDataType* const __restrict__ p_out_value_global)
     {
-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::GetIdentityValue();

         // LDS
         __shared__ AccDataType p_reduce_work_buffer[BlockSize];
@@ -179,7 +179,7 @@ struct GridwiseReduction_mk_to_m_multiblock
         const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_value_global,
             in_grid_desc_m_k.GetElementSpaceSize(),
-            type_convert<InDataType>(zeroVal));
+            type_convert<InDataType>(identityVal));
         auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_out_value_global, out_grid_desc_m.GetElementSpaceSize());
@@ -191,7 +191,7 @@ struct GridwiseReduction_mk_to_m_multiblock
         StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = identityVal; });

         const index_t thread_local_id = get_thread_local_1d_id();
         const index_t block_global_id = get_block_1d_id();
@@ -358,12 +358,12 @@ struct GridwiseReduction_mk_to_m_multiblock
         __shared__ AccDataType p_reduce_work_val_buffer[BlockSize];
         __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize];

-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::GetIdentityValue();

         const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_value_global,
             in_grid_desc_m_k.GetElementSpaceSize(),
-            type_convert<InDataType>(zeroVal));
+            type_convert<InDataType>(identityVal));
         const auto in_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize());
         auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -418,7 +418,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                                      thread_k_cluster_id * KThreadSliceSize));

         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            accu_value_buf(I) = zeroVal;
+            accu_value_buf(I) = identityVal;
             accu_index_buf(I) = 0;
         });
@@ -459,7 +459,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                 in_thread_idx_buf);

             static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
-                AccDataType tmpValue = zeroVal;
+                AccDataType tmpValue = identityVal;
                 IndexDataType tmpIndex = 0;

                 static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
@@ -512,7 +512,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                                            in_thread_val_buf(Number<offset>{}));
                 });

-                AccDataType tmpValue = zeroVal;
+                AccDataType tmpValue = identityVal;
                 IndexDataType tmpIndex = 0;

                 static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp (+6, -6)

@@ -135,12 +135,12 @@ struct GridwiseReduction_mk_to_m_threadwise
                                                           ReduceOperation,
                                                           PropagateNan>;

-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::GetIdentityValue();

         const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_value_global,
             in_grid_desc_m_k.GetElementSpaceSize(),
-            type_convert<InDataType>(zeroVal));
+            type_convert<InDataType>(identityVal));
         auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_out_value_global, out_grid_desc_m.GetElementSpaceSize());
@@ -149,7 +149,7 @@ struct GridwiseReduction_mk_to_m_threadwise
         StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = identityVal; });

         const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
@@ -276,12 +276,12 @@ struct GridwiseReduction_mk_to_m_threadwise
         (void)acc_elementwise_op;

-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::GetIdentityValue();

         const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_value_global,
             in_grid_desc_m_k.GetElementSpaceSize(),
-            type_convert<InDataType>(zeroVal));
+            type_convert<InDataType>(identityVal));
         const auto in_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize());
@@ -303,7 +303,7 @@ struct GridwiseReduction_mk_to_m_threadwise
         StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;

         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            accu_value_buf(I) = zeroVal;
+            accu_value_buf(I) = identityVal;
             accu_index_buf(I) = 0;
         });
include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp (+2, -2)

@@ -816,10 +816,10 @@ struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                     false>;

             // Global write Gemm shuffle + reduction
-            const auto d_zeroVal = DReduceOperation::GetReductionZeroVal();
+            const auto d_identityVal = DReduceOperation::GetIdentityValue();

             static_for<0, mreduce_per_thread, 1>{}(
-                [&](auto I) { d_thread_buf(I) = d_zeroVal; });
+                [&](auto I) { d_thread_buf(I) = d_identityVal; });

             // reduce in VGPR
             static_for<0, mreduce_per_thread, 1>{}([&](auto im) {
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp (+1, -1)

@@ -340,7 +340,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
     using DefaultBlock2CTileMap =
         remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1))>;

-    template <bool HasMainKBlockLoop, typename Block2CTileMap = DefaultBlock2CTileMap>
+    template <bool HasMainKBlockLoop, typename Block2CTileMap>
     __device__ static void Run(const FloatAB* __restrict__ p_a_grid,
                                const FloatAB* __restrict__ p_b_grid,
include/ck/utility/math_v2.hpp (+60, -10)

@@ -3,11 +3,13 @@
 #include <cmath>
 #include "data_type.hpp"
-#include "half.hpp"
+#include "type.hpp"

 namespace ck {
 namespace math {

 // math functions for the host, some are implemented by calling C++ std functions

 static inline __host__ float abs(float x) { return std::abs(x); };

 static inline __host__ double abs(double x) { return std::abs(x); };
@@ -28,26 +30,26 @@ static inline __host__ int32_t abs(int32_t x)
 static inline __host__ half_t abs(half_t x)
 {
-    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+    uint16_t xx = ck::bit_cast<uint16_t>(x);

-    half_float::half abs_xx = half_float::abs(xx);
+    uint16_t abs_xx = xx & 0x7fff;

-    half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx);
+    half_t abs_x = ck::bit_cast<half_t>(abs_xx);

     return abs_x;
 };

-static inline __host__ float isnan(float x) { return std::isnan(x); };
+static inline __host__ bool isnan(float x) { return std::isnan(x); };

-static inline __host__ double isnan(double x) { return std::isnan(x); };
+static inline __host__ bool isnan(double x) { return std::isnan(x); };

-static inline __host__ int8_t isnan(int8_t x)
+static inline __host__ bool isnan(int8_t x)
 {
     (void)x;
     return false;
 };

-static inline __host__ int32_t isnan(int32_t x)
+static inline __host__ bool isnan(int32_t x)
 {
     (void)x;
     return false;
@@ -55,11 +57,59 @@ static inline __host__ int32_t isnan(int32_t x)
 static inline __host__ bool isnan(half_t x)
 {
-    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+    uint16_t xx = ck::bit_cast<uint16_t>(x);

-    return half_float::isnan(xx);
+    return (xx & 0x7FFF) > 0x7C00;
 };

 static inline __host__ float sqrt(float x) { return std::sqrt(x); };

 static inline __host__ double sqrt(double x) { return std::sqrt(x); };

+// math functions for the HIP kernel, some are implemented by calling hip builtin functions
+
+static inline __device__ float abs(float x) { return ::abs(x); };
+
+static inline __device__ double abs(double x) { return ::abs(x); };
+
+static inline __device__ int8_t abs(int8_t x)
+{
+    int8_t sgn = x >> (8 - 1);
+
+    return (x ^ sgn) - sgn;
+};
+
+static inline __device__ int32_t abs(int32_t x)
+{
+    int32_t sgn = x >> (32 - 1);
+
+    return (x ^ sgn) - sgn;
+};
+
+static inline __device__ half_t abs(half_t x) { return ::__habs(x); };
+
+static inline __device__ bool isnan(float x) { return ::isnan(x); };
+
+static inline __device__ bool isnan(double x) { return ::isnan(x); };
+
+static inline __device__ bool isnan(int8_t x)
+{
+    (void)x;
+    return false;
+};
+
+static inline __device__ bool isnan(int32_t x)
+{
+    (void)x;
+    return false;
+};
+
+static inline __device__ bool isnan(half_t x) { return ::__hisnan(x); };
+
+static inline __device__ float sqrt(float x) { return ::sqrtf(x); };
+
+static inline __device__ double sqrt(double x) { return ::sqrt(x); };

 } // namespace math
 } // namespace ck
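The net effect of the hunks above is that ck::math now provides matching __host__ and __device__ overloads of abs, isnan, and sqrt, so callers elsewhere in this commit (the elementwise functors and the accumulate helpers) can use one spelling in both contexts. A minimal sketch, assuming HIP compilation so that the __host__ __device__ qualifiers apply:

// Sketch: the same expression compiles on both sides because both overload sets exist in
// ck::math after this change; the NaN-to-zero behavior is purely for illustration.
template <typename T>
__host__ __device__ T clamped_magnitude(T x)
{
    using ck::math::abs;
    using ck::math::isnan;

    return isnan(x) ? T{0} : abs(x);
}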
include/ck/utility/reduction_functions_accumulate.hpp (+14, -21)

@@ -27,6 +27,7 @@
 #define CK_REDUCTION_FUNCTIONS_BINOP_HPP

 #include "data_type.hpp"
+#include "math_v2.hpp"
 #include "reduction_common.hpp"
 #include "reduction_operator.hpp"
@@ -34,18 +35,6 @@
 namespace ck {
 namespace detail {

-template <typename T>
-static inline __device__ bool is_nan(T x)
-{
-    return (isnan(x));
-};
-
-template <>
-inline __device__ bool is_nan<half_t>(half_t x)
-{
-    return (__hisnan(x));
-};
-
 template <bool PropagateNan, typename ReduceOperation, typename AccDataType>
 struct AccumulateWithNanCheck;
@@ -53,7 +42,7 @@ template <typename ReduceOperation, typename AccDataType>
 struct AccumulateWithNanCheck<false, ReduceOperation, AccDataType>
 {
     // cppcheck-suppress constParameter
-    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    __host__ __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
     {
         ReduceOperation{}(accuVal, currVal);
     };
@@ -62,9 +51,11 @@ struct AccumulateWithNanCheck<false, ReduceOperation, AccDataType>
 template <typename ReduceOperation, typename AccDataType>
 struct AccumulateWithNanCheck<true, ReduceOperation, AccDataType>
 {
-    __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
+    __host__ __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal)
     {
-        if(is_nan(currVal))
+        using ck::math::isnan;
+
+        if(isnan(currVal))
         {
             accuVal = currVal;
         }
@@ -81,7 +72,7 @@ struct AccumulateWithIndexAndNanCheck;
 template <typename ReduceOperation, typename AccDataType, typename IndexDataType>
 struct AccumulateWithIndexAndNanCheck<false, ReduceOperation, AccDataType, IndexDataType>
 {
-    __device__ static inline void
+    __host__ __device__ static inline void
     // cppcheck-suppress constParameter
     Calculate(AccDataType& accuVal,
               AccDataType currVal,
@@ -101,12 +92,14 @@ template <typename ReduceOperation, typename AccDataType, typename IndexDataType
 struct AccumulateWithIndexAndNanCheck<true, ReduceOperation, AccDataType, IndexDataType>
 {
     // The method is called when the ReduceOperation is indexable and the user asked for indices
-    __device__ static inline void Calculate(AccDataType& accuVal,
-                                            AccDataType currVal,
-                                            IndexDataType& accuIndex,
-                                            IndexDataType currIndex)
+    __host__ __device__ static inline void Calculate(AccDataType& accuVal,
+                                                     AccDataType currVal,
+                                                     IndexDataType& accuIndex,
+                                                     IndexDataType currIndex)
     {
-        if(is_nan(currVal))
+        using ck::math::isnan;
+
+        if(isnan(currVal))
         {
             accuVal   = currVal;
             accuIndex = currIndex;
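Because Calculate is now __host__ __device__ and the NaN test goes through ck::math::isnan, the same accumulator helper serves the host reference reductions below as well as the GPU kernels. A hedged host-side sketch; Max's operator() body is not shown in this hunk, and the comment assumes it is the usual in-place max:

// Sketch: accumulate a max-reduction over a few values with NaN propagation enabled.
inline float nan_aware_max(const float* values, std::size_t n)
{
    using ReduceOp     = ck::reduce::Max<float>;
    using Accumulation = ck::detail::AccumulateWithNanCheck<true /*PropagateNan*/, ReduceOp, float>;

    float acc = ReduceOp::GetIdentityValue(); // NumericLimits<float>::Lowest()

    for(std::size_t i = 0; i < n; ++i)
        Accumulation::Calculate(acc, values[i]); // acc = isnan(v) ? v : max(acc, v)

    return acc;
}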
include/ck/utility/reduction_operator.hpp (+7, -10)

@@ -36,7 +36,7 @@ namespace reduce {
 // Every binary operator used in reduction is represented by a templated functor class. Each functor
 // class must provide at least
 // three members:
-// 1) GetReductionZeroVal() -- the interface to return the "identity element" for the binary
+// 1) GetIdentityValue() -- the interface to return the "identity element" for the binary
 // operator, "identity element" is the unique
 // element in the algebraic space that doesn't affect the value of other elements
 // when operated against them, and the concept is similar to zero vector in
@@ -59,7 +59,7 @@ struct Add
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };

     __device__ static constexpr bool IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -76,7 +76,7 @@ struct Mul
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(1.0f); };

     __device__ static constexpr bool IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -92,7 +92,7 @@ struct Max
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal()
+    __host__ __device__ static constexpr T GetIdentityValue()
     {
         return NumericLimits<T>::Lowest();
     };
@@ -125,10 +125,7 @@ struct Min
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal()
-    {
-        return NumericLimits<T>::Max();
-    };
+    __host__ __device__ static constexpr T GetIdentityValue() { return NumericLimits<T>::Max(); };

     __device__ static constexpr bool IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -158,7 +155,7 @@ struct AMax
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };

     __device__ static constexpr bool IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -184,7 +181,7 @@ struct AMax
 template <typename T>
-T GetReductionZeroValueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
+T GetIdentityValueueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
 {
     T result = ck::type_convert<T>(0.0f);
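To make the renamed contract concrete: GetIdentityValue() must return the value that leaves any operand unchanged under the functor, which is what lets reductions seed their accumulators and pad out-of-bounds reads safely. A small illustrative sketch using the functors above:

// Sketch: the identity element leaves any x unchanged under the corresponding operator.
inline bool identity_value_property_holds(float x)
{
    float a = x;
    ck::reduce::Add<float>{}(a, ck::reduce::Add<float>::GetIdentityValue()); // a = x + 0.0f

    float m = x;
    ck::reduce::Mul<float>{}(m, ck::reduce::Mul<float>::GetIdentityValue()); // m = x * 1.0f

    return a == x && m == x;
}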
library/include/ck/library/host_tensor/host_reduce_util.hpp (deleted, mode 100644 → 0, +0, -257); the removed contents follow.
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
#define GUARD_HOST_REDUCE_UTIL_HPP

#include <limits>
#include <cmath>
#include <functional>

#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "math_v2.hpp"

namespace ck {
namespace host_reduce {

using ck::NanPropagation;
using ck::ReduceTensorOp;

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
{
    using ck::math::abs;

    if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
    {
        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_) { a_ = a_ * a_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else
    {
        // ReduceTensorOp::AVG, ADD, MUL, MIN, MAX:
        return ([&](AccDataType&) {});
    };
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t divider)
{
    using std::sqrt;

    if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_) { a_ = sqrt(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AVG)
    {
        return ([&, divider](AccDataType& a_) {
            a_ = a_ / static_cast<AccDataType>(static_cast<float>(divider));
        });
    }
    else
    {
        // ReduceTensorOp::ADD, NORM1, MUL, MIN, MAX, AMAX:
        return ([&](AccDataType&) {});
    }
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn()
{
    if constexpr(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG ||
                 ReduceOpId == ReduceTensorOp::NORM1 || ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MUL)
    {
        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ > b_)
                a_ = b_;
        });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ < b_)
                a_ = b_;
        });
    }
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType, bool& changed)> ReduceOpFn2()
{
    if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ > b_)
            {
                a_      = b_;
                changed = true;
            }
            else
                changed = false;
        });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ < b_)
            {
                a_      = b_;
                changed = true;
            }
            else
                changed = false;
        });
    }
    else
    {
        // ReduceTensorOp::ADD, MUL, AVG, NORM1, NORM2:
        return (std::function<void(AccDataType&, AccDataType, bool&)>{});
    };
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline AccDataType ReduceOpZeroVal()
{
    if constexpr(ReduceOpId == ReduceTensorOp::MUL)
    {
        return (static_cast<AccDataType>(1.0f));
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return (ck::NumericLimits<AccDataType>::Max());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
    {
        return (ck::NumericLimits<AccDataType>::Lowest());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
    {
        return (static_cast<AccDataType>(0.0f));
    }
    else
    {
        // ReduceTensorOp::ADD, AVG, NORM1, NORM2:
        return (static_cast<AccDataType>(0.0f));
    };
};

template <typename AccDataType, bool PropagateNan>
__host__ static inline void binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
                                                 AccDataType& accuVal,
                                                 AccDataType currVal)
{
    using ck::math::isnan;

    if constexpr(!PropagateNan)
    {
        opReduce(accuVal, currVal);
    }
    else
    {
        if(isnan(currVal))
            accuVal = currVal;
        else
            opReduce(accuVal, currVal);
    };
};

template <typename AccDataType, typename IndexDataType, bool PropagateNan>
__host__ static inline void
binop_with_index_and_nan_check(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
                               AccDataType& accuVal,
                               AccDataType currVal,
                               IndexDataType& accuIndex,
                               IndexDataType currIndex)
{
    using ck::math::isnan;

    if constexpr(!PropagateNan)
    {
        bool changed;

        opReduce(accuVal, currVal, changed);

        if(changed)
            accuIndex = currIndex;
    }
    else
    {
        if(isnan(currVal))
        {
            accuVal   = currVal;
            accuIndex = currIndex;
        }
        else
        {
            bool changed;

            opReduce(accuVal, currVal, changed);

            if(changed)
                accuIndex = currIndex;
        };
    };
};

}; // namespace host_reduce
}; // namespace ck

#endif
library/include/ck/library/host_tensor/host_reduction.hpp (+33, -38)

@@ -33,10 +33,10 @@
 #include "reduction_enums.hpp"
 #include "reduction_common.hpp"
-#include "host_reduce_util.hpp"
 #include "host_common_util.hpp"
 #include "host_tensor.hpp"
 #include "data_type.hpp"
+#include "reduction_functions_accumulate.hpp"

 template <int NDim>
 static void get_all_indexes(const std::array<size_t, NDim>& dimLengths,
@@ -106,11 +106,13 @@ static size_t get_offset_from_index(const std::vector<size_t>& strides,
 template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
           ck::ReduceTensorOp ReduceOpId,
+          typename ReduceOperation,
           typename InElementwiseOperation,
           typename AccElementwiseOperation,
           int Rank,
           int NumReduceDim,
           bool PropagateNan,
-          bool NeedIndices>
+          bool OutputIndex>
 struct ReductionHost
 {
     using IndexDataType = int32_t;
@@ -122,8 +124,6 @@ struct ReductionHost
     std::vector<int> reduceDims;
     IndexDataType divider;

-    std::function<void(AccDataType&)> preUnaryOp;
-    std::function<void(AccDataType&)> posUnaryOp;
     std::array<size_t, NumReduceDim> reduceLengths;
     std::array<size_t, NumReduceDim> reduceStrides;
     std::array<size_t, NumInvariantDim> invariantLengths;
@@ -137,9 +137,6 @@ struct ReductionHost
                   const std::vector<int>& invariantDims_,
                   const std::vector<int>& reduceDims_)
     {
-        using ck::host_reduce::PosUnaryOpFn;
-        using ck::host_reduce::PreUnaryOpFn;
-
         // this->outLengths = to_int_vector(outDesc.GetLengths());
         this->outStrides = outDesc.GetStrides();
@@ -171,9 +168,6 @@ struct ReductionHost
             invariant_dim_indexes.clear();

             get_all_indexes<NumInvariantDim>(invariantLengths, invariant_dim_indexes);
         };
-
-        preUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
-        posUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
     };

     void Run(float alpha,
@@ -182,7 +176,7 @@ struct ReductionHost
              OutDataType* out_data,
              IndexDataType* out_indices)
     {
-        if constexpr(NeedIndices)
+        if constexpr(OutputIndex)
         {
             RunImpl_with_index(alpha, in_data, beta, out_data, out_indices);
         }
@@ -201,15 +195,17 @@ struct ReductionHost
         using ck::float_equal_one;
         using ck::float_equal_zero;
         using ck::type_convert;
-        using ck::host_reduce::binop_with_index_and_nan_check;
-        using ck::host_reduce::ReduceOpFn2;
-        using ck::host_reduce::ReduceOpZeroVal;

-        auto opReduce2 = ReduceOpFn2<AccDataType, ReduceOpId>();
+        using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck<PropagateNan,
+                                                                        ReduceOperation,
+                                                                        AccDataType,
+                                                                        IndexDataType>;
+
+        InElementwiseOperation in_elementwise_op(divider);
+        AccElementwiseOperation acc_elementwise_op(divider);

         if constexpr(NumInvariantDim == 0)
         {
-            AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+            AccDataType accuVal = ReduceOperation::GetIdentityValue();
             IndexDataType accuIndex = 0;

             for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
@@ -219,15 +215,14 @@ struct ReductionHost
                 auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);

                 auto currIndex = static_cast<IndexDataType>(i);

-                binop_with_index_and_nan_check<AccDataType, IndexDataType, PropagateNan>(
-                    opReduce2, accuVal, currVal, accuIndex, currIndex);
+                Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
             };

-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);

             if(!float_equal_one{}(alpha))
                 accuVal *= type_convert<AccDataType>(alpha);
@@ -241,7 +236,7 @@ struct ReductionHost
         else
         {
             auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+                AccDataType accuVal = ReduceOperation::GetIdentityValue();
                 IndexDataType accuIndex = 0;

                 auto offset_invariant =
@@ -255,15 +250,14 @@ struct ReductionHost
                     auto currVal = type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

-                    preUnaryOp(currVal);
+                    in_elementwise_op(currVal, currVal);

                     auto currIndex = static_cast<IndexDataType>(i);

-                    binop_with_index_and_nan_check<AccDataType, IndexDataType, PropagateNan>(
-                        opReduce2, accuVal, currVal, accuIndex, currIndex);
+                    Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
                 };

-                posUnaryOp(accuVal);
+                acc_elementwise_op(accuVal, accuVal);

                 if(!float_equal_one{}(alpha))
                     accuVal *= type_convert<AccDataType>(alpha);
@@ -308,15 +302,16 @@ struct ReductionHost
         using ck::float_equal_one;
         using ck::float_equal_zero;
         using ck::type_convert;
-        using ck::host_reduce::binop_with_nan_check;
-        using ck::host_reduce::ReduceOpFn;
-        using ck::host_reduce::ReduceOpZeroVal;

-        auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
+        using Accumulation =
+            ck::detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
+
+        InElementwiseOperation in_elementwise_op(divider);
+        AccElementwiseOperation acc_elementwise_op(divider);

         if constexpr(NumInvariantDim == 0)
         {
-            AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+            AccDataType accuVal = ReduceOperation::GetIdentityValue();

             for(const auto& reduce_index : reduce_dim_indexes)
             {
@@ -325,12 +320,12 @@ struct ReductionHost
                 auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);

-                binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
+                Accumulation::Calculate(accuVal, currVal);
             };

-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);

             if(!float_equal_one{}(alpha))
                 accuVal *= type_convert<AccDataType>(alpha);
@@ -343,7 +338,7 @@ struct ReductionHost
         else
         {
             auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+                AccDataType accuVal = ReduceOperation::GetIdentityValue();

                 auto offset_invariant =
                     get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
@@ -356,12 +351,12 @@ struct ReductionHost
                     auto currVal = type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

-                    preUnaryOp(currVal);
+                    in_elementwise_op(currVal, currVal);

-                    binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
+                    Accumulation::Calculate(accuVal, currVal);
                 };

-                posUnaryOp(accuVal);
+                acc_elementwise_op(accuVal, accuVal);

                 if(!float_equal_one{}(alpha))
                     accuVal *= type_convert<AccDataType>(alpha);
library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp (new file, mode 100644, +203, -0)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2022 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#pragma once
#include <iostream>
#include <sstream>
#include "device_base.hpp"
#include "host_tensor.hpp"
namespace ck {
namespace tensor_operation {
namespace host {

// FIXME: support arbitrary elementwise operation for A/B/C
template <typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
          enable_if_t<
              is_same_v<AElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
                  is_same_v<BElementwiseOperation, ck::tensor_operation::element_wise::PassThrough> &&
                  is_same_v<CElementwiseOperation, ck::tensor_operation::element_wise::PassThrough>,
              bool> = false>
struct ReferenceCGemm : public device::BaseOperator
{
    // Argument
    struct Argument : public device::BaseArgument
    {
        Argument(const Tensor<ADataType>& a_m_k_real, const Tensor<ADataType>& a_m_k_imag,
                 const Tensor<BDataType>& b_k_n_real, const Tensor<BDataType>& b_k_n_imag,
                 Tensor<CDataType>& c_m_n_real, Tensor<CDataType>& c_m_n_imag,
                 AElementwiseOperation a_element_op,
                 BElementwiseOperation b_element_op,
                 CElementwiseOperation c_element_op)
            : a_m_k_real_{a_m_k_real},
              a_m_k_imag_{a_m_k_imag},
              b_k_n_real_{b_k_n_real},
              b_k_n_imag_{b_k_n_imag},
              c_m_n_real_{c_m_n_real},
              c_m_n_imag_{c_m_n_imag},
              a_element_op_{a_element_op},
              b_element_op_{b_element_op},
              c_element_op_{c_element_op}
        {
        }

        const Tensor<ADataType>& a_m_k_real_;
        const Tensor<ADataType>& a_m_k_imag_;
        const Tensor<BDataType>& b_k_n_real_;
        const Tensor<BDataType>& b_k_n_imag_;
        Tensor<CDataType>& c_m_n_real_;
        Tensor<CDataType>& c_m_n_imag_;

        AElementwiseOperation a_element_op_;
        BElementwiseOperation b_element_op_;
        CElementwiseOperation c_element_op_;
    };

    // Invoker
    struct Invoker : public device::BaseInvoker
    {
        using Argument = ReferenceCGemm::Argument;

        float Run(const Argument& arg)
        {
            const std::size_t K = arg.a_m_k_real_.mDesc.GetLengths()[1];

            if(K != arg.a_m_k_imag_.mDesc.GetLengths()[1])
            {
                throw std::runtime_error("wrong! Incompatible real and imag sizes in CGEMM");
            }

            auto f_mk_kn_mn_real = [&](auto m, auto n) {
                float v_c_real = 0;

                for(std::size_t k = 0; k < K; ++k)
                {
                    float v_a_real = ck::type_convert<float>(arg.a_m_k_real_(m, k));
                    float v_a_imag = ck::type_convert<float>(arg.a_m_k_imag_(m, k));
                    float v_b_real = ck::type_convert<float>(arg.b_k_n_real_(k, n));
                    float v_b_imag = ck::type_convert<float>(arg.b_k_n_imag_(k, n));

                    v_c_real += v_a_real * v_b_real - v_a_imag * v_b_imag;
                }

                arg.c_m_n_real_(m, n) = v_c_real;
            };

            auto f_mk_kn_mn_imag = [&](auto m, auto n) {
                float v_c_imag = 0;

                for(std::size_t k = 0; k < K; ++k)
                {
                    float v_a_real = ck::type_convert<float>(arg.a_m_k_real_(m, k));
                    float v_a_imag = ck::type_convert<float>(arg.a_m_k_imag_(m, k));
                    float v_b_real = ck::type_convert<float>(arg.b_k_n_real_(k, n));
                    float v_b_imag = ck::type_convert<float>(arg.b_k_n_imag_(k, n));

                    v_c_imag += v_a_real * v_b_imag + v_a_imag * v_b_real;
                }

                arg.c_m_n_imag_(m, n) = v_c_imag;
            };

            make_ParallelTensorFunctor(f_mk_kn_mn_real,
                                       arg.c_m_n_real_.mDesc.GetLengths()[0],
                                       arg.c_m_n_real_.mDesc.GetLengths()[1])(
                std::thread::hardware_concurrency());

            make_ParallelTensorFunctor(f_mk_kn_mn_imag,
                                       arg.c_m_n_imag_.mDesc.GetLengths()[0],
                                       arg.c_m_n_imag_.mDesc.GetLengths()[1])(
                std::thread::hardware_concurrency());

            return 0;
        }

        float Run(const device::BaseArgument* p_arg,
                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg));
        }
    };

    static constexpr bool IsValidCompilationParameter()
    {
        // TODO: properly implement this check
        return true;
    }

    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }

    static auto MakeArgument(const Tensor<ADataType>& a_m_k_real, const Tensor<ADataType>& a_m_k_imag,
                             const Tensor<BDataType>& b_k_n_real, const Tensor<BDataType>& b_k_n_imag,
                             Tensor<CDataType>& c_m_n_real, Tensor<CDataType>& c_m_n_imag,
                             AElementwiseOperation a_element_op,
                             BElementwiseOperation b_element_op,
                             CElementwiseOperation c_element_op)
    {
        return Argument{a_m_k_real, a_m_k_imag, b_k_n_real, b_k_n_imag, c_m_n_real, c_m_n_imag,
                        a_element_op, b_element_op, c_element_op};
    }

    static auto MakeInvoker() { return Invoker{}; }

    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
    {
        return std::make_unique<Invoker>(Invoker{});
    }

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "ReferenceCGemm"
            << std::endl;
        // clang-format on

        return str.str();
    }
};

} // namespace host
} // namespace tensor_operation
} // namespace ck
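A hedged usage sketch of the reference op above, as it would be used to verify a device CGEMM result. The wrapper function and the way the Tensor objects are produced are assumptions; the MakeArgument / MakeInvoker / Run flow and the real/imaginary output formulas come from this file.

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using RefCGemm    = ck::tensor_operation::host::ReferenceCGemm<float, float, float,
                                                               PassThrough, PassThrough, PassThrough>;

void reference_cgemm_on_host(const Tensor<float>& a_re, const Tensor<float>& a_im,
                             const Tensor<float>& b_re, const Tensor<float>& b_im,
                             Tensor<float>& c_re, Tensor<float>& c_im)
{
    auto ref_op      = RefCGemm{};
    auto ref_arg     = ref_op.MakeArgument(a_re, a_im, b_re, b_im, c_re, c_im,
                                           PassThrough{}, PassThrough{}, PassThrough{});
    auto ref_invoker = ref_op.MakeInvoker();

    // Fills c_re = Re(A)Re(B) - Im(A)Im(B) and c_im = Re(A)Im(B) + Im(A)Re(B) on the host.
    ref_invoker.Run(ref_arg);
}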
profiler/include/profile_batched_gemm_reduce_impl.hpp (+2, -2)

@@ -171,8 +171,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
         {
             for(int m = 0; m < M; ++m)
             {
-                float d0_acc = d0_reduce_op.GetReductionZeroVal();
-                float d1_acc = d1_reduce_op.GetReductionZeroVal();
+                float d0_acc = d0_reduce_op.GetIdentityValue();
+                float d1_acc = d1_reduce_op.GetIdentityValue();

                 for(int n = 0; n < N; ++n)
                 {
profiler/include/profile_gemm_reduce_impl.hpp (+2, -2)

@@ -165,8 +165,8 @@ bool profile_gemm_reduce_impl(int do_verification,
             for(int m = 0; m < M; ++m)
             {
-                float d0_acc = d0_reduce_op.GetReductionZeroVal();
-                float d1_acc = d1_reduce_op.GetReductionZeroVal();
+                float d0_acc = d0_reduce_op.GetIdentityValue();
+                float d1_acc = d1_reduce_op.GetIdentityValue();

                 for(int n = 0; n < N; ++n)
                 {