gaoqiong / composable_kernel · Commits · b2290854

Commit b2290854, authored May 27, 2022 by rocking
Merge commit '3e6c2610' into gemm_norm
Parents: 253f7ef2, 3e6c2610

Changes: 201 changed files in total; this page shows 20 changed files with 375 additions and 171 deletions (+375 −171).
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp                                    +5   −4
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp                                    +3   −3
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp                                  +3   −3
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp                                    +4   −4
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp                                    +4   −4
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp                                    +4   −4
include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp                                  +5   −8
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp                      +1   −3
include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp                                                  +33  −3
include/ck/utility/amd_xdlops.hpp                                                                     +19  −0
include/ck/utility/generic_memory_space_atomic.hpp                                                    +23  −0
include/ck/utility/inner_product.hpp                                                                  +2   −5
include/ck/utility/reduction_operator.hpp                                                             +51  −7
include/ck/utility/static_buffer.hpp                                                                  +7   −2
library/include/ck/library/host_tensor/host_common_util.hpp                                           +102 −0
library/include/ck/library/host_tensor/host_reduce_util.hpp                                           +7   −19
library/include/ck/library/host_tensor/host_reduction.hpp                                             +8   −10
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp                          +7   −6
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp            +1   −16
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp  +86  −70
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp

 #pragma once
 #include "common_header.hpp"
 #include "multi_index_transform_helper.hpp"
 #include "tensor_descriptor.hpp"
 ...
@@ -287,11 +288,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
     }

     // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto
-    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01)
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
+        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
     {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n, M01, N01);
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(c_grid_desc_m_n);
     }

     using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 = ...
 ...
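The hunk above (and the matching ones in the other gridwise GEMM headers on this page) swaps the M00_N00_M01_N01 block-to-C-tile map for the M00_N0_M01Adapt variant and starts ignoring the caller-supplied M01/N01. The sketch below is only a conceptual illustration of what an M01-grouped mapping computes; it is my own simplified code, not the library's BlockToCTileMap_M00_N0_M01Adapt, and the struct/member names plus the ragged-edge handling are hypothetical.

    #include <utility>

    // Map a linear block id to a (m0, n0) C-tile index by walking groups of M01
    // consecutive tile rows column by column, so blocks launched back-to-back
    // touch nearby C tiles. Handling of a ragged last group is omitted.
    struct Block2CTileMapSketch
    {
        int M0;      // number of C tile rows    = M / MPerBlock
        int N0;      // number of C tile columns = N / NPerBlock
        int M01 = 8; // tile rows per group (the KSplit call sites later in this diff pass 8)

        std::pair<int, int> CalculateBottomIndex(int block_id) const
        {
            const int blocks_per_group = M01 * N0;
            const int group            = block_id / blocks_per_group;
            const int id_in_group      = block_id % blocks_per_group;
            const int m0               = group * M01 + id_in_group % M01;
            const int n0               = id_in_group / M01;
            return {m0, n0};
        }
    };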
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp

 ...
@@ -265,10 +265,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
     // return block_id to C matrix tile idx (m0, n0) mapping
     __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor(
-        const CMNGridDesc& c_m_n_grid_desc, index_t M01, index_t N01, index_t KBatch)
+        const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
     {
-        return BlockToCTileMap_KSplit_M00_N00_M01_N01<MPerBlock, NPerBlock, CMNGridDesc>(
-            c_m_n_grid_desc, M01, N01, KBatch);
+        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CMNGridDesc>(
+            c_m_n_grid_desc, 8, KBatch);
     }

     using CM0N0M1N1M2M3M4N2GridDesc = decltype(MakeCM0N0M1N1M2M3M4N2GridDescriptor(CMNGridDesc{}));
 ...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp

 ...
@@ -239,10 +239,10 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
     // return block_id to C matrix tile idx (m0, n0) mapping
     __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor(
-        const CMNGridDesc& c_m_n_grid_desc, index_t M01, index_t N01, index_t KBatch)
+        const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch)
     {
-        return BlockToCTileMap_KSplit_M00_N00_M01_N01<MPerBlock, NPerBlock, CMNGridDesc>(
-            c_m_n_grid_desc, M01, N01, KBatch);
+        return BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, CMNGridDesc>(
+            c_m_n_grid_desc, 8, KBatch);
     }

     __host__ __device__ static constexpr auto ...
 ...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp

 ...
@@ -300,11 +300,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
     }

     // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto
-    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01)
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
+        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
     {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n, M01, N01);
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(c_grid_desc_m_n);
     }

     using CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl =
         remove_cvref_t<decltype( ...
 ...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp

 ...
@@ -309,11 +309,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
     }

     // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto
-    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01)
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
+        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
     {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n, M01, N01);
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(c_grid_desc_m_n);
     }

     using CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl = ...
 ...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp

 ...
@@ -316,11 +316,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
     }

     // return block_id to C matrix tile idx (m0, n0) mapping
-    __host__ __device__ static constexpr auto
-    MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01)
+    __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap(
+        const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */)
     {
-        return BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, CGridDesc_M_N>(
-            c_grid_desc_m_n, M01, N01);
+        return BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, CGridDesc_M_N>(c_grid_desc_m_n);
     }

     using CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl =
         remove_cvref_t<decltype( ...
 ...
include/ck/tensor_operation/gpu/thread/threadwise_contraction_dlops.hpp → include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp

-#ifndef CK_THREADWISE_CONTRACTION_DLOPS_HPP
-#define CK_THREADWISE_CONTRACTION_DLOPS_HPP
+#pragma once

 #include "common_header.hpp"
 #include "math.hpp"
 ...
@@ -25,9 +23,9 @@ template <typename FloatA,
                           BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
                           CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
                       bool>::type = false>
-struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1
+struct ThreadwiseGemmDl_km0m1_kn0n1_m0m1n0n1
 {
-    __device__ constexpr ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1()
+    __device__ constexpr ThreadwiseGemmDl_km0m1_kn0n1_m0m1n0n1()
     {
         static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
                           BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
 ...
@@ -124,9 +122,9 @@ template <typename FloatA,
                           BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
                           CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
                       bool>::type = false>
-struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
+struct ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
 {
-    __device__ constexpr ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1()
+    __device__ constexpr ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1()
     {
         static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
                           BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
 ...
@@ -220,4 +218,3 @@ struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_
 };

 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp

-#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
-#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V5R1_HPP
+#pragma once

 #include "common_header.hpp"
 #include "tensor_descriptor.hpp"
 ...
@@ -609,4 +608,3 @@ struct ThreadwiseTensorSliceTransfer_v5r1
 };

 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp

 ...
@@ -25,6 +25,7 @@ enum struct MfmaInstr
     mfma_f32_16x16x8bf16,
     mfma_i32_32x32x8i8,
     mfma_i32_16x16x16i8,
+    mfma_f64_16x16x4f64
 };

 template <MfmaInstr instr>
 ...
@@ -383,12 +384,40 @@ struct mfma_type<MfmaInstr::mfma_i32_16x16x16i8>
     }
 };

+template <>
+struct mfma_type<MfmaInstr::mfma_f64_16x16x4f64>
+{
+    static constexpr index_t group_size          = 1;
+    static constexpr index_t num_groups_per_blk  = 4;
+    static constexpr index_t num_regs_per_blk    = 4; // group_size * num_groups_per_blk;
+    static constexpr index_t num_threads_per_blk = 16;
+    static constexpr index_t wave_size           = 64;
+    static constexpr index_t num_input_blks      = 4; // wave_size / num_threads_per_blk;
+    static constexpr index_t num_output_blks     = 1;
+    static constexpr index_t m_per_blk           = 16;
+    static constexpr index_t n_per_blk           = 16;
+    static constexpr index_t k_per_blk           = 1;
+    static constexpr bool is_k_reduction         = true;
+
+    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
+    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    {
+        intrin_mfma_f64_16x16x4f64<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+    }
+};

 template <typename base_type, index_t MPerXdlops, index_t NPerXdlops>
 struct MfmaSelector
 {
     template <typename base_type_, index_t MPerXdlops_, index_t NPerXdlops_>
     static constexpr auto GetMfma();

+    template <>
+    static constexpr auto GetMfma<double, 16, 16>()
+    {
+        return MfmaInstr::mfma_f64_16x16x4f64;
+    }
+
     template <>
     static constexpr auto GetMfma<float, 64, 64>()
     {
 ...
@@ -661,9 +690,10 @@ struct XdlopsGemm
     template <class FloatA, class FloatB, class FloatC>
     __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const
     {
-        static_assert(is_same<base_type, float>::value || is_same<base_type, half_t>::value ||
-                          is_same<base_type, bhalf_t>::value || is_same<base_type, int8_t>::value,
-                      "base base_type must be float, half, bfloat16, and int8_t!");
+        static_assert(is_same<base_type, double>::value || is_same<base_type, float>::value ||
+                          is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value ||
+                          is_same<base_type, int8_t>::value,
+                      "base base_type must be double, float, half, bfloat16, and int8_t!");

         static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) {
             mfma_instr.template run<MPerXdlops, NPerXdlops>(p_a_wave[k], p_b_wave[k], p_c_thread);
 ...
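The mfma_type<MfmaInstr::mfma_f64_16x16x4f64> constants added above are internally consistent: a 64-lane wavefront holding num_regs_per_blk = 4 accumulators per lane covers 64 × 4 = 256 = 16 × 16 output elements, and with is_k_reduction the num_input_blks = 4 input blocks each contribute k_per_blk = 1, giving the K = 4 in the instruction name. A compile-time spot check of that arithmetic (illustrative only, not part of the commit):

    using f64_mfma = mfma_type<MfmaInstr::mfma_f64_16x16x4f64>;

    // 64 lanes x 4 accumulator registers = one full 16x16 output tile per wave.
    static_assert(f64_mfma::wave_size * f64_mfma::num_regs_per_blk ==
                      f64_mfma::m_per_blk * f64_mfma::n_per_blk,
                  "one wave covers the 16x16 fp64 output tile");

    // 4 input blocks x k_per_blk of 1 = the K depth of 4 in mfma_f64_16x16x4f64.
    static_assert(f64_mfma::num_input_blks * f64_mfma::k_per_blk == 4,
                  "each instruction reduces over K = 4");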
include/ck/utility/amd_xdlops.hpp

 ...
@@ -294,5 +294,24 @@ struct intrin_mfma_i32_16x16x16i8<16, 16>
     }
 };

+template <index_t MPerWave, index_t NPerWave>
+struct intrin_mfma_f64_16x16x4f64;
+
+template <>
+struct intrin_mfma_f64_16x16x4f64<16, 16>
+{
+    template <class FloatC>
+    __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c)
+    {
+#ifdef __gfx90a__
+        reg_c.template AsType<double4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64(
+            reg_a, reg_b, reg_c.template AsType<double4_t>()[Number<0>{}], 0, 0, 0);
+#else
+        ignore = reg_a;
+        ignore = reg_b;
+        ignore = reg_c;
+#endif
+    }
+};
 } // namespace ck
 #endif
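For reference, the same builtin can be exercised without the ck::vector_type plumbing. The snippet below is a stand-alone illustrative wrapper (not part of the commit; the typedef and function name are made up), assuming a HIP compilation unit, using the same cbsz/abid/blgp = 0 modifiers as above and only emitting the instruction for gfx90a:

    // Hypothetical helper, for illustration only.
    typedef double illus_double4_t __attribute__((ext_vector_type(4)));

    __device__ inline illus_double4_t mfma_f64_16x16x4_sketch(double a, double b, illus_double4_t acc)
    {
    #if defined(__gfx90a__)
        // One MFMA instruction: a 16x16 fp64 output tile per wavefront, K depth of 4.
        return __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, acc, 0, 0, 0);
    #else
        (void)a;
        (void)b;
        return acc; // fall through on targets without fp64 MFMA
    #endif
    }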
include/ck/utility/generic_memory_space_atomic.hpp

 ...
@@ -28,6 +28,12 @@ __device__ float atomic_add<float>(float* p_dst, const float& x)
     return atomicAdd(p_dst, x);
 }

+template <>
+__device__ double atomic_add<double>(double* p_dst, const double& x)
+{
+    return atomicAdd(p_dst, x);
+}
+
 template <>
 __device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x)
 {
 ...
@@ -45,6 +51,23 @@ __device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x)
     return vy.template AsType<float2_t>()[I0];
 }

+template <>
+__device__ double2_t atomic_add<double2_t>(double2_t* p_dst, const double2_t& x)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    const vector_type<double, 2> vx{x};
+    vector_type<double, 2> vy{0};
+
+    vy.template AsType<double>()(I0) =
+        atomicAdd(c_style_pointer_cast<double*>(p_dst), vx.template AsType<double>()[I0]);
+    vy.template AsType<double>()(I1) =
+        atomicAdd(c_style_pointer_cast<double*>(p_dst) + 1, vx.template AsType<double>()[I1]);
+
+    return vy.template AsType<double2_t>()[I0];
+}
+
 // Caution: DO NOT REMOVE
 // intentionally have only declaration but no definition to cause compilation failure when trying to
 // instantiate this template. The purpose is to make the implementation of atomic_max explicit for
 ...
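A natural use of the new overloads is folding fp64 partial results straight into a global buffer. The HIP kernel below is a hypothetical usage sketch (not from the commit), assuming the header above is included and that atomic_add and double2_t live in namespace ck like the rest of this file:

    // Fold a [num_chunks x out_len] array of fp64 pairs into out[out_len].
    __global__ void fold_partials(ck::double2_t* out,
                                  const ck::double2_t* partials,
                                  int out_len,
                                  int num_chunks)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if(i < out_len * num_chunks)
        {
            // Two scalar atomicAdds under the hood, as implemented above.
            ck::atomic_add<ck::double2_t>(out + (i % out_len), partials[i]);
        }
    }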
include/ck/utility/inner_product.hpp

-#ifndef CK_INNER_PRODUCT_HPP
-#define CK_INNER_PRODUCT_HPP
+#pragma once

 #include "data_type.hpp"

 namespace ck {
 ...
@@ -138,7 +136,7 @@ template <>
 __device__ void
 inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c)
 {
-#if defined(CK_USE_DOT4_I32_I8)
+#if defined(CK_USE_AMD_V_DOT4_I32_I8)
 #if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
     asm volatile("\n \
             v_dot4_i32_i8 %0, %1, %2, %0 \n \
 ...
@@ -202,4 +200,3 @@ inner_product<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a, const int8x16_t
 }

 } // namespace ck
-#endif
include/ck/utility/reduction_operator.hpp

 ...
@@ -26,7 +26,8 @@
 #ifndef CK_REDUCTION_OPERATOR_HPP
 #define CK_REDUCTION_OPERATOR_HPP

-#include "common_header.hpp"
+#include "config.hpp"
+#include "data_type.hpp"

 namespace ck {
 ...
@@ -41,12 +42,10 @@ namespace reduce {
 // when operated against them, and the concept is similar to zero vector in
 // vector space
 // (http://pages.cs.wisc.edu/~matthewb/pages/notes/pdf/linearalgebra/VectorSpaces.pdf).
-// 2) indexable -- boolean value indicating whether indices of the operated elements could be
-//    recorded. Usually, Min/Max operator could
-//    need to record the indices of elements. For operator like Add/Mul, no need to
-//    record the indices.
-// 3) operator() -- the first argument of the operator must be both an input & output, and the
-//    corresponding variable usually stores
+// 2) IsCompatibleInMemoryDataOperation() -- return true if the reduction task corresponding to this
+//    operator can use the InMemoryDataOperation to finalize, or else it return false 3) operator() --
+//    the first argument of the operator must be both an input & output, and the corresponding variable
+//    usually stores
 //    the accumulated result of many operator() calls; the second argument is only an
 //    input. For indexable binary
 //    operator, the second version of operator() has third argument (which is an
 ...
@@ -62,6 +61,13 @@ struct Add
     __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };

+    __device__ static constexpr bool IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
+    {
+        return operation == InMemoryDataOperationEnum::AtomicAdd ||
+               operation == InMemoryDataOperationEnum::Set;
+    };
+
     __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a + b; }
 };
 ...
@@ -72,6 +78,12 @@ struct Mul
     __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };

+    __device__ static constexpr bool IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
+    {
+        return operation == InMemoryDataOperationEnum::Set;
+    };
+
     __host__ __device__ inline constexpr void operator()(T& a, T b) const { a = a * b; }
 };
 ...
@@ -85,6 +97,13 @@ struct Max
         return NumericLimits<T>::Lowest();
     };

+    __device__ static constexpr bool IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
+    {
+        // ToChange: atomic_max to be added
+        return operation == InMemoryDataOperationEnum::Set;
+    };
+
     __host__ __device__ inline constexpr void operator()(T& a, T b) const
     {
         if(a < b)
 ...
@@ -111,6 +130,13 @@ struct Min
         return NumericLimits<T>::Max();
     };

+    __device__ static constexpr bool IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
+    {
+        // ToChange: atomic_min to be added
+        return operation == InMemoryDataOperationEnum::Set;
+    };
+
     __host__ __device__ inline constexpr void operator()(T& a, T b) const
     {
         if(a > b)
 ...
@@ -134,6 +160,13 @@ struct AMax
     __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };

+    __device__ static constexpr bool IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
+    {
+        // ToChange: atomic_max to be added
+        return operation == InMemoryDataOperationEnum::Set;
+    };
+
     __host__ __device__ inline constexpr void operator()(T& a, T b) const
     {
         if(a < b)
 ...
@@ -150,6 +183,17 @@ struct AMax
     }
 };

+template <typename T>
+T GetReductionZeroValueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
+{
+    T result = ck::type_convert<T>(0.0f);
+
+    if(operation == InMemoryDataOperationEnum::AtomicMax)
+        result = ck::NumericLimits<T>::Lowest();
+
+    return (result);
+};
+
 } // end of namespace reduce

 } // end of namespace ck
 ...
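The new IsCompatibleInMemoryDataOperation() hook lets reduction code decide, per operator, whether a given finalization strategy is legal: Add accepts AtomicAdd or Set, while Mul/Max/Min/AMax currently accept only Set. A device-side dispatch on the hook might look like the following (hypothetical illustration, not from the commit; assumes the ck headers above are included):

    template <typename ReduceOp, typename T>
    __device__ void finalize_partial(T* p_dst, T partial)
    {
        // The hook is constexpr, so the branch is resolved at compile time.
        if constexpr(ReduceOp::IsCompatibleInMemoryDataOperation(
                         ck::InMemoryDataOperationEnum::AtomicAdd))
        {
            ck::atomic_add<T>(p_dst, partial); // e.g. ck::reduce::Add<T>
        }
        else
        {
            *p_dst = partial; // operators that only support Set
        }
    }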
include/ck/utility/static_buffer.hpp

 ...
@@ -36,6 +36,11 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
     {
         return base::operator()(i);
     }

+    __host__ __device__ void Clear()
+    {
+        static_for<0, N, 1>{}([&](auto i) { operator()(i) = T{0}; });
+    }
 };

 // static buffer for vector
 ...
@@ -146,9 +151,9 @@ struct StaticBufferTupleOfVector
     __host__ __device__ void Clear()
     {
-        const index_t numScalars = NumOfVector * ScalarPerVector;
+        constexpr index_t NumScalars = NumOfVector * ScalarPerVector;

-        static_for<0, Number<numScalars>{}, 1>{}([&](auto i) { SetAsType(i, S{0}); });
+        static_for<0, NumScalars, 1>{}([&](auto i) { SetAsType(i, S{0}); });
     }
 };
 ...
library/include/ck/library/host_tensor/host_common_util.hpp  (new file, mode 100644)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_HOST_COMMON_UTIL_HPP
#define GUARD_HOST_COMMON_UTIL_HPP
#include <vector>
#include <iostream>
#include <fstream>
#include <string>
#include "config.hpp"
namespace ck {

namespace host_common {

template <typename T>
static inline void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems)
{
    std::ofstream outFile(fileName, std::ios::binary);
    if(outFile)
    {
        outFile.write(reinterpret_cast<char*>(data), dataNumItems * sizeof(T));
        outFile.close();
        std::cout << "Write output to file " << fileName << std::endl;
    }
    else
    {
        std::cout << "Could not open file " << fileName << " for writing" << std::endl;
    }
};

template <typename T>
static inline T getSingleValueFromString(const std::string& valueStr)
{
    std::istringstream iss(valueStr);

    T val;

    iss >> val;

    return (val);
};

template <typename T>
static inline std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
    std::string valuesStr(cstr_values);

    std::vector<T> values;
    std::size_t pos = 0;
    std::size_t new_pos;

    new_pos = valuesStr.find(',', pos);
    while(new_pos != std::string::npos)
    {
        const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);

        T val = getSingleValueFromString<T>(sliceStr);

        values.push_back(val);

        pos     = new_pos + 1;
        new_pos = valuesStr.find(',', pos);
    };

    std::string sliceStr = valuesStr.substr(pos);
    T val                = getSingleValueFromString<T>(sliceStr);

    values.push_back(val);

    return (values);
}

}; // namespace host_common
}; // namespace ck

#endif
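Typical host-side usage of these new helpers is parsing a comma-separated command-line argument and dumping a result buffer for inspection. An illustrative example (not part of the commit; file name and values are made up):

    #include <vector>
    #include "host_common_util.hpp"

    int main()
    {
        // "64,32,8" -> {64, 32, 8}
        std::vector<int> lengths = ck::host_common::getTypeValuesFromString<int>("64,32,8");

        std::vector<float> buf(lengths[0], 1.0f);

        // Writes lengths[0] * sizeof(float) bytes to out.bin and prints a confirmation.
        ck::host_common::dumpBufferToFile<float>("out.bin", buf.data(), buf.size());

        return 0;
    }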
library/include/ck/library/host_tensor/host_reduce_util.hpp

 ...
@@ -28,9 +28,7 @@
 #include <limits>
 #include <cmath>
 #include <cassert>
 #include <stdexcept>
 #include <string>
 #include <functional>

 #include "reduction_enums.hpp"
 #include "data_type.hpp"
 ...
@@ -214,13 +212,13 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
     };
 };

-template <typename AccDataType, bool PropagateNan>
+template <typename AccDataType, typename IndexDataType, bool PropagateNan>
 __host__ static inline void
-binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
-                      AccDataType& accuVal,
-                      AccDataType currVal,
-                      int& accuIndex,
-                      int currIndex)
+binop_with_index_and_nan_check(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
+                               AccDataType& accuVal,
+                               AccDataType currVal,
+                               IndexDataType& accuIndex,
+                               IndexDataType currIndex)
 {
     using ck::math::isnan;
 ...
@@ -254,16 +252,6 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
     };
 }; // namespace host_reduce

-static inline std::vector<int> to_int_vector(const std::vector<size_t>& inData)
-{
-    std::vector<int> outData;
-
-    for(auto elem : inData)
-        outData.push_back(static_cast<int>(elem));
-
-    return (outData);
-};
-
 }; // namespace ck

 #endif
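The renamed helper follows a simple contract: the wrapped reduce functor reports through its bool out-parameter whether it replaced the accumulator, and the wrapper then copies over the candidate's index. A self-contained simplification of that pattern (my own illustration of the NaN-free path, not the library code):

    #include <functional>

    template <typename AccDataType, typename IndexDataType>
    void reduce_with_index(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
                           AccDataType& accuVal,
                           AccDataType currVal,
                           IndexDataType& accuIndex,
                           IndexDataType currIndex)
    {
        bool changed = false;
        opReduce(accuVal, currVal, changed); // e.g. max: replace accuVal if currVal is larger
        if(changed)
            accuIndex = currIndex; // remember where the current best came from
    }

    // Scanning the values 3, 7, 5 with a max functor leaves accuVal = 7 and accuIndex = 1.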
library/include/ck/library/host_tensor/host_reduction.hpp

 ...
@@ -34,6 +34,7 @@
 #include "reduction_enums.hpp"
 #include "reduction_common.hpp"
 #include "host_reduce_util.hpp"
+#include "host_common_util.hpp"
 #include "host_tensor.hpp"
 #include "data_type.hpp"
 ...
@@ -200,7 +201,7 @@ struct ReductionHost
         using ck::float_equal_one;
         using ck::float_equal_zero;
         using ck::type_convert;
-        using ck::host_reduce::binop_with_nan_check2;
+        using ck::host_reduce::binop_with_index_and_nan_check;
         using ck::host_reduce::ReduceOpFn2;
         using ck::host_reduce::ReduceOpZeroVal;
 ...
@@ -211,8 +212,7 @@ struct ReductionHost
             AccDataType accuVal     = ReduceOpZeroVal<AccDataType, ReduceOpId>();
             IndexDataType accuIndex = 0;

-            for(IndexDataType i = 0; i < ck::type_convert<IndexDataType>(reduce_dim_indexes.size());
-                i++)
+            for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
             {
                 auto offset_reduce =
                     get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
 ...
@@ -221,9 +221,9 @@ struct ReductionHost
                 preUnaryOp(currVal);

-                auto currIndex = i;
+                auto currIndex = static_cast<IndexDataType>(i);

-                binop_with_nan_check2<AccDataType, PropagateNan>(
+                binop_with_index_and_nan_check<AccDataType, IndexDataType, PropagateNan>(
                     opReduce2, accuVal, currVal, accuIndex, currIndex);
             };
 ...
@@ -247,9 +247,7 @@ struct ReductionHost
             auto offset_invariant =
                 get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);

-            for(IndexDataType i = 0; i < ck::type_convert<IndexDataType>(reduce_dim_indexes.size());
-                i++)
+            for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
             {
                 auto offset_reduce =
                     get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
 ...
@@ -259,9 +257,9 @@ struct ReductionHost
                 preUnaryOp(currVal);

-                auto currIndex = i;
+                auto currIndex = static_cast<IndexDataType>(i);

-                binop_with_nan_check2<AccDataType, PropagateNan>(
+                binop_with_index_and_nan_check<AccDataType, IndexDataType, PropagateNan>(
                     opReduce2, accuVal, currVal, accuIndex, currIndex);
             };
 ...
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp

 ...
@@ -11,6 +11,7 @@ namespace host {
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
+          typename AccDataType,
           typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation>
 ...
@@ -53,20 +54,20 @@ struct ReferenceGemm : public device::BaseOperator
             auto f_mk_kn_mn = [&](auto m, auto n) {
                 const int K = arg.a_m_k_.mDesc.GetLengths()[1];

-                float v_acc = 0;
+                AccDataType v_acc = 0;

                 for(int k = 0; k < K; ++k)
                 {
-                    float v_a;
-                    float v_b;
+                    AccDataType v_a;
+                    AccDataType v_b;

-                    arg.a_element_op_(v_a, static_cast<const float>(arg.a_m_k_(m, k)));
-                    arg.b_element_op_(v_b, static_cast<const float>(arg.b_k_n_(k, n)));
+                    arg.a_element_op_(v_a, static_cast<const AccDataType>(arg.a_m_k_(m, k)));
+                    arg.b_element_op_(v_b, static_cast<const AccDataType>(arg.b_k_n_(k, n)));

                     v_acc += v_a * v_b;
                 }

-                float v_c;
+                AccDataType v_c;

                 arg.c_element_op_(v_c, v_acc);
 ...
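The point of this hunk is that the accumulator and the element-op temporaries now use the new AccDataType template parameter instead of hard-coded float, so an fp64 problem can be verified with double accumulation. A stand-alone version of the same loop structure (illustrative only; flat row-major buffers in place of the library's Tensor arguments and element-wise operators):

    template <typename ADataType, typename BDataType, typename CDataType, typename AccDataType>
    void reference_gemm_mk_kn_mn(const ADataType* a, const BDataType* b, CDataType* c,
                                 int M, int N, int K)
    {
        for(int m = 0; m < M; ++m)
        {
            for(int n = 0; n < N; ++n)
            {
                AccDataType v_acc = 0; // accumulate in AccDataType, e.g. double for fp64 GEMM

                for(int k = 0; k < K; ++k)
                {
                    const AccDataType v_a = static_cast<AccDataType>(a[m * K + k]);
                    const AccDataType v_b = static_cast<AccDataType>(b[k * N + n]);
                    v_acc += v_a * v_b;
                }

                c[m * N + n] = static_cast<CDataType>(v_acc);
            }
        }
    }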
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp

 ...
@@ -9,26 +9,11 @@
 #include "device_reduce_instance_blockwise_i8_i8_i8.hpp"
 #include "device_reduce_instance_blockwise_i8_i32_i8.hpp"
 #include "device_reduce_instance_blockwise_b16_f32_b16.hpp"
-#include "device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp"
-#include "device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp"
-#include "device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp"
-#include "device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp"
-#include "device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp"
-#include "device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp"
-#include "device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp"
-#include "device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp"
 #include "device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp"
 #include "device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp"
 #include "device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp"
 #include "device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp"
 #include "device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp"
-#include "device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp"
-#include "device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp"
-#include "device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp"
-#include "device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp"
-#include "device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp"
-#include "device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp"
-#include "device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp"
-#include "device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp"
 #include "device_reduce_instance_threadwise_f16_f16_f16.hpp"
 #include "device_reduce_instance_threadwise_f16_f32_f16.hpp"
 #include "device_reduce_instance_threadwise_f32_f32_f32.hpp"
 ...
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp

 ...
@@ -3,13 +3,27 @@
 #include "reduction_operator_mapping.hpp"
 #include "device_reduce_instance_impl_common.hpp"
-#include "device_reduce_blockwise.hpp"
+#include "device_reduce_multiblock.hpp"

 namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace device_reduce_instance {

+using reduce_configuration_1_instances_blockwise = std::tuple<
+    // clang-format off
+    // BlockSize | MThreadClusterSize | KThreadClusterSize
+    ReductionConfiguration_1<256, 128, 2>,
+    ReductionConfiguration_1<256, 64, 4>,
+    ReductionConfiguration_1<256, 32, 8>,
+    ReductionConfiguration_1<256, 16, 16>,
+    ReductionConfiguration_1<256, 8, 32>,
+    ReductionConfiguration_1<256, 4, 64>,
+    ReductionConfiguration_1<256, 2, 128>,
+    ReductionConfiguration_1<256, 1, 256>
+    // clang-format on
+    >;
+
 #ifdef QUICK_REDUCE_TEST
 using reduce_configuration_2_instances_blockwise = std::tuple<
     // clang-format off
 ...
@@ -58,8 +72,8 @@ template <typename InDataType,
           int Rank,
           int NumReduceDim,
           ReduceTensorOp ReduceOpId,
-          NanPropagation NanOpt,
-          ReduceTensorIndices IndicesOpt>
+          bool PropagateNan,
+          bool UseIndex>
 void add_device_reduce_instance_blockwise(
     std::vector<deviceReduceBlockWisePtrType<AccDataType, ReduceOpId>>& device_op_instances)
 {
 ...
@@ -73,92 +87,94 @@ void add_device_reduce_instance_blockwise(
     constexpr bool Indexable =
         (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
         ReduceOpId == ReduceTensorOp::AMAX);
-    constexpr bool NeedIndices  = Indexable && (IndicesOpt != ReduceTensorIndices::NO_INDICES);
-    constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
-
-    static_for<0, std::tuple_size<reduce_configuration_1_instances>::value, 1>{}([&](auto i) {
-        using cfg1 =
-            remove_cvref_t<decltype(std::get<i.value>(reduce_configuration_1_instances{}))>;
-
-        static_for<0, std::tuple_size<reduce_configuration_2_instances_blockwise>::value, 1>{}(
-            [&](auto j) {
-                using cfg2 = remove_cvref_t<decltype(
-                    std::get<j.value>(reduce_configuration_2_instances_blockwise{}))>;
-
-                using ReduceOpInstance =
-                    DeviceReduceBlockWise<InDataType, AccDataType, OutDataType,
-                                          Rank, NumReduceDim, ReduceOperation,
-                                          InElementwiseOperation, AccElementwiseOperation,
-                                          PropagateNan, NeedIndices,
-                                          cfg1::BlockSize_, cfg1::MThreadClusterSize_, cfg1::KThreadClusterSize_,
-                                          cfg2::MThreadSliceSize_, cfg2::KThreadSliceSize_,
-                                          cfg2::InSrcVectorDim_, cfg2::InSrcVectorSize_, cfg2::OutDstVectorSize_>;
-
-                device_op_instances.push_back(
-                    std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
-            });
-    });
+    constexpr bool OutputIndex = Indexable && UseIndex;
+
+    static_for<0, std::tuple_size<reduce_configuration_1_instances_blockwise>::value, 1>{}(
+        [&](auto i) {
+            using cfg1 = remove_cvref_t<decltype(
+                std::get<i.value>(reduce_configuration_1_instances_blockwise{}))>;
+
+            static_for<0, std::tuple_size<reduce_configuration_2_instances_blockwise>::value, 1>{}(
+                [&](auto j) {
+                    using cfg2 = remove_cvref_t<decltype(
+                        std::get<j.value>(reduce_configuration_2_instances_blockwise{}))>;
+
+                    using ReduceOpInstance =
+                        DeviceReduceMultiBlock<InDataType, AccDataType, OutDataType,
+                                               Rank, NumReduceDim, ReduceOperation,
+                                               InElementwiseOperation, AccElementwiseOperation,
+                                               InMemoryDataOperationEnum::Set,
+                                               PropagateNan, OutputIndex,
+                                               false, // HaveIndexInputIfOutputIndex
                                               cfg1::BlockSize_, cfg1::MThreadClusterSize_, cfg1::KThreadClusterSize_,
                                               cfg2::MThreadSliceSize_, cfg2::KThreadSliceSize_,
                                               cfg2::InSrcVectorDim_, cfg2::InSrcVectorSize_, cfg2::OutDstVectorSize_>;
+
+                    device_op_instances.push_back(
+                        std::make_unique<ReduceOpInstance>(ReduceOpInstance{}));
+                });
        });
 };

-#define ADD_BLOCKWISE_INST_BY_TYPE(                                                       \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                 \
-    template void add_device_reduce_instance_blockwise<inT,                               \
-                                                       compT,                             \
-                                                       outT,                              \
-                                                       Rank,                              \
-                                                       NumReduceDim,                      \
-                                                       ReduceOpId,                        \
-                                                       NanOpt,                            \
-                                                       IndicesOpt>(                       \
+#define ADD_BLOCKWISE_INST_BY_TYPE(                                                       \
+    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim)             \
+    template void add_device_reduce_instance_blockwise<inT,                               \
+                                                       compT,                             \
+                                                       outT,                              \
+                                                       Rank,                              \
+                                                       NumReduceDim,                      \
+                                                       ReduceOpId,                        \
+                                                       PropagateNan,                      \
+                                                       UseIndex>(                         \
         std::vector<deviceReduceBlockWisePtrType<compT, ReduceOpId>> & device_op_instances)

-#define ADD_BLOCKWISE_INST_BY_ID(                                                         \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                 \
-    ADD_BLOCKWISE_INST_BY_TYPE(inT,                                                       \
-                               compT,                                                     \
-                               outT,                                                      \
-                               static_cast<ReduceTensorOp>(ReduceOpId),                   \
-                               static_cast<NanPropagation>(NanOpt),                       \
-                               static_cast<ReduceTensorIndices>(IndicesOpt),              \
-                               Rank,                                                      \
+#define ADD_BLOCKWISE_INST_BY_ID(                                                         \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                 \
+    ADD_BLOCKWISE_INST_BY_TYPE(inT,                                                       \
+                               compT,                                                     \
+                               outT,                                                      \
+                               static_cast<ReduceTensorOp>(ReduceOpId),                   \
+                               static_cast<bool>(NanOpt),                                 \
+                               static_cast<bool>(IndicesOpt),                             \
+                               Rank,                                                      \
                                NumReduceDim)

 #define ADD_BLOCKWISE_INST_REF_BY_TYPE(                                                   \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                 \
+    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim)             \
     extern template void add_device_reduce_instance_blockwise<inT,                        \
                                                               compT,                      \
                                                               outT,                       \
                                                               Rank,                       \
                                                               NumReduceDim,               \
                                                               ReduceOpId,                 \
-                                                              NanOpt,                     \
-                                                              IndicesOpt>(                \
+                                                              PropagateNan,               \
+                                                              UseIndex>(                  \
        std::vector<DeviceReducePtr<                                                       \
            typename reduce_unary_operator<compT, ReduceOpId, true, true>::InElementwiseOperation, \
            typename reduce_unary_operator<compT, ReduceOpId, true, true>::                \
                AccElementwiseOperation>> &                                                \
            device_op_instances)

-#define ADD_BLOCKWISE_INST_REF_BY_ID(                                                     \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                 \
-    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT,                                                   \
-                                   compT,                                                 \
-                                   outT,                                                  \
-                                   static_cast<ReduceTensorOp>(ReduceOpId),               \
-                                   static_cast<NanPropagation>(NanOpt),                   \
-                                   static_cast<ReduceTensorIndices>(IndicesOpt),          \
-                                   Rank,                                                  \
+#define ADD_BLOCKWISE_INST_REF_BY_ID(                                                     \
+    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)                 \
+    ADD_BLOCKWISE_INST_REF_BY_TYPE(inT,                                                   \
+                                   compT,                                                 \
+                                   outT,                                                  \
+                                   static_cast<ReduceTensorOp>(ReduceOpId),               \
+                                   static_cast<bool>(NanOpt),                             \
+                                   static_cast<bool>(IndicesOpt),                         \
+                                   Rank,                                                  \
                                    NumReduceDim)

 } // namespace device_reduce_instance
 ...
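The instantiation code above walks the compile-time cross product of two configuration tuples and creates one DeviceReduceMultiBlock instance per (cfg1, cfg2) pair. A self-contained sketch of that pattern using standard C++ in place of ck::static_for (all names below are made up for illustration):

    #include <cstddef>
    #include <tuple>
    #include <utility>

    template <int BlockSize, int MThreadClusterSize, int KThreadClusterSize>
    struct Cfg1
    {
    };

    template <int MThreadSliceSize, int KThreadSliceSize>
    struct Cfg2
    {
    };

    using cfg1_list = std::tuple<Cfg1<256, 128, 2>, Cfg1<256, 64, 4>>;
    using cfg2_list = std::tuple<Cfg2<4, 1>, Cfg2<8, 1>>;

    // Call f with a default-constructed object of every type in Tuple.
    template <typename Tuple, typename F, std::size_t... Is>
    void for_each_type_impl(F&& f, std::index_sequence<Is...>)
    {
        (f(std::tuple_element_t<Is, Tuple>{}), ...);
    }

    template <typename Tuple, typename F>
    void for_each_type(F&& f)
    {
        for_each_type_impl<Tuple>(std::forward<F>(f),
                                  std::make_index_sequence<std::tuple_size_v<Tuple>>{});
    }

    int count_instances()
    {
        int n = 0;
        for_each_type<cfg1_list>([&](auto cfg1) {
            for_each_type<cfg2_list>([&](auto cfg2) {
                // In the real header, DeviceReduceMultiBlock<..., cfg1 params, cfg2 params>
                // is built here and pushed into device_op_instances.
                (void)cfg1;
                (void)cfg2;
                ++n;
            });
        });
        return n; // 2 x 2 = 4 instances in this toy example
    }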