gaoqiong / composable_kernel_ROCM / Commits

Commit 705d5a08, authored Sep 08, 2023 by Jun Liu

Merge branch 'develop' into amd-develop

Parents: d4ad52d6, 8f84a012

Changes: 168 in total; showing 20 changed files with 1273 additions and 177 deletions (+1273, -177).
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp  +2  -1
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp  +41  -4
include/ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp  +97  -0
include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl_dpp8.hpp  +0  -136
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp  +2  -2
include/ck/tensor_operation/gpu/warp/dpp_gemm.hpp  +322  -0
include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp  +18  -15
include/ck/utility/amd_gemm_dpp.hpp  +51  -5
include/ck/utility/inner_product_dpp8.hpp  +4  -0
include/ck/utility/loop_scheduler.hpp  +26  -0
include/ck/utility/reduction_operator.hpp  +60  -1
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp  +2  -2
library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp  +325  -0
library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp  +10  -1
library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp  +4  -2
library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp  +5  -0
library/include/ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp  +59  -0
library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp  +8  -8
library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp  +76  -0
library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp  +161  -0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp  (+2, -1)

@@ -37,7 +37,8 @@ __global__ void
             index_t StrideC,
             typename GridwiseGemm::Block2CTileMap block_mapping)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
     constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
     __shared__ uint8_t p_shared[shared_size];
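The change above only widens the architecture guard to the gfx94x family. As a minimal sketch of the same pattern, assuming the standard HIP per-target macros (the kernel name and body are placeholders, not from this commit):

#include <hip/hip_runtime.h>

// Compile the device body only for targets that support the required XDL/MFMA
// instructions; the !defined(__HIP_DEVICE_COMPILE__) branch keeps the host pass working.
__global__ void my_streamk_kernel_sketch()
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    // ... real kernel body using shared memory and XDL instructions ...
#else
    // Unsupported target: emit an empty kernel instead of failing the build.
#endif
}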
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp  (+41, -4)

@@ -194,7 +194,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
           StrideC{StrideC_},
           MPadded{CalculateMPadded(M_)},
           NPadded{CalculateNPadded(N_)},
-          K0{CalculateK0(K)}
+          K0{CalculateK0(K_)}
     {
     }

@@ -383,7 +383,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
     __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
     {
-        const index_t num_loop = K / (K0PerBlock * K1);
+        const index_t num_loop = math::integer_divide_ceil(K, K0PerBlock * K1);
         return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
     }

@@ -840,7 +840,25 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext
             }
         }();

-        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)
+        {
+            const auto K0Pad = math::integer_divide_ceil(K0, K0PerBlock) * K0PerBlock;
+            const auto KPad  = K0Pad * K1Value;
+
+            const auto a_grid_desc_m_kpad = transform_tensor_descriptor(
+                a_grid_desc_m_k,
+                make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return transform_tensor_descriptor(
+                a_grid_desc_m_kpad,
+                make_tuple(make_unmerge_transform(make_tuple(K0Pad, K1Value)),
+                           make_right_pad_transform(M, MPad - M)),
+                make_tuple(Sequence<1>{}, Sequence<0>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
         {
             return transform_tensor_descriptor(
                 a_grid_desc_m_k,

@@ -874,7 +892,26 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext
             }
         }();

-        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
+        if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding)
+        {
+            const auto K0Pad = math::integer_divide_ceil(K0, K0PerBlock) * K0PerBlock;
+            const auto KPad  = K0Pad * K1Value;
+
+            const auto b_grid_desc_kpad_n = transform_tensor_descriptor(
+                b_grid_desc_k_n,
+                make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return transform_tensor_descriptor(
+                b_grid_desc_kpad_n,
+                make_tuple(make_unmerge_transform(make_tuple(K0Pad, K1Value)),
+                           make_right_pad_transform(N, NPad - N)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        }
+        else if constexpr(GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding)
         {
             return transform_tensor_descriptor(
                 b_grid_desc_k_n,
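For intuition about the new MNKPadding branch: K is right-padded so that K0 becomes a multiple of K0PerBlock, and only then unmerged into (K0Pad, K1). A small worked sketch with assumed numbers, not taken from the commit:

// Assumed tile parameters, for illustration only.
constexpr int integer_divide_ceil(int a, int b) { return (a + b - 1) / b; }

constexpr int K = 1000, K0PerBlock = 4, K1Value = 8;
constexpr int K0    = K / K1Value;                                      // 125
constexpr int K0Pad = integer_divide_ceil(K0, K0PerBlock) * K0PerBlock; // 128
constexpr int KPad  = K0Pad * K1Value;                                  // 1024
static_assert(KPad - K == 24); // right-pad of 24 elements along K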
include/ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp  (new file, +97)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

template <typename InputGridDesc,
          typename InputDataType,
          typename OutputGridDesc,
          typename OutputDataType,
          index_t BlockSize,
          index_t MPerBlock,
          index_t KPerBlock,
          typename ThreadClusterLengths,
          index_t ScalarPerVector,
          typename Block2ETileMap>
struct GridwiseImageToColumn
{
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};

    using ThisThreadBlock = ThisThreadBlock<BlockSize>;

    __device__ static void Run(const InputGridDesc& in_grid_desc,
                               const InputDataType* __restrict__ p_in_global,
                               const OutputGridDesc& out_grid_desc,
                               OutputDataType* __restrict__ p_out_global,
                               const Block2ETileMap& block_2_tile_map)
    {
        const auto block_work_idx =
            block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));

        const index_t m_block_data_idx_on_grid =
            __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock);
        const index_t k_block_data_idx_on_grid =
            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * KPerBlock);

        // Global Memory
        const auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_global, in_grid_desc.GetElementSpaceSize());
        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_global, out_grid_desc.GetElementSpaceSize());

        auto copy_global_to_global = ThreadGroupTensorSliceTransfer_v7<
            ThisThreadBlock,
            Tuple<InputDataType>,
            Tuple<OutputDataType>,
            decltype(tie(in_grid_desc)),
            decltype(tie(out_grid_desc)),
            tensor_operation::element_wise::PassThrough,
            Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
            Sequence<MPerBlock, KPerBlock>,
            ThreadClusterLengths,
            Sequence<0, 1>,
            Sequence<0, 1>,
            I1,
            ScalarPerVector,
            Sequence<true>,
            Sequence<true>>{
            in_grid_desc,
            make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
            out_grid_desc,
            make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)),
            tensor_operation::element_wise::PassThrough{}};

        copy_global_to_global.Run(
            tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
    }

    __host__ static constexpr bool CheckValidity(const InputGridDesc& in_grid_desc,
                                                 const OutputGridDesc& out_grid_desc)
    {
        if(in_grid_desc.GetLength(I0) % MPerBlock != 0 ||
           in_grid_desc.GetLength(I1) % KPerBlock != 0)
            return false;
        if(out_grid_desc.GetLength(I0) % MPerBlock != 0 ||
           out_grid_desc.GetLength(I1) % KPerBlock != 0)
            return false;
        return true;
    }
};

} // namespace ck
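The kernel above is a tiled global-to-global copy; CheckValidity only requires that both descriptors tile evenly by MPerBlock x KPerBlock. A minimal sketch of that rule on plain integers (the example sizes are assumptions):

// For image-to-column, M = N * (output spatial size) and K = C * (filter spatial size).
constexpr bool tiles_evenly(int M, int K, int MPerBlock, int KPerBlock)
{
    return (M % MPerBlock == 0) && (K % KPerBlock == 0);
}
static_assert(tiles_evenly(/*M=*/256, /*K=*/128, /*MPerBlock=*/64, /*KPerBlock=*/32));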
include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl_dpp8.hpp  (deleted, -136)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/amd_gemm_dpp.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/utility/inner_product_dpp8.hpp"
#include "ck/utility/math.hpp"

namespace ck {

/**
 * Threadwise contraction using dot instructions with DPP8 modifier.
 *
 * Assumptions:
 *   1. `AThreadDesc_TK0_TM0_TM1_TK1`, `BThreadDesc_TK0_TN0_TN1_TK1`, `CThreadDesc_TM0_TM1_TN0_TN1`
 *      are known at compile-time;
 *   2. `AOriginIdx`, `BOriginIdx`, `COriginIdx` are known at compile-time;
 *   3. `TM0` is equal to 1 and `TN0` is equal to 1;
 *   4. When `ShareA` is set (unset, respectively), `TM1` (`TN1`, respectively) is divisible by
 *      the size of the lane group (`dpp8::lane_group_size`).
 */
template <typename FloatA,
          typename FloatB,
          typename FloatC,
          typename AThreadDesc_TK0_TM0_TM1_TK1,
          typename BThreadDesc_TK0_TN0_TN1_TK1,
          typename CThreadDesc_TM0_TM1_TN0_TN1,
          typename TKLengths,
          typename TMLengths,
          typename TNLengths,
          bool ShareA,
          typename enable_if<AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
                                 BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
                                 CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
                             bool>::type = false>
struct ThreadwiseContractionDlDpp8_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
{
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};

    static constexpr index_t TK0 = TKLengths{}[I0];
    static constexpr index_t TK1 = TKLengths{}[I1];
    static constexpr index_t TM0 = TMLengths{}[I0];
    static constexpr index_t TM1 = TMLengths{}[I1];
    static constexpr index_t TN0 = TNLengths{}[I0];
    static constexpr index_t TN1 = TNLengths{}[I1];

    static_assert(TM0 == 1 && TN0 == 1);
    static_assert((ShareA && TM1 % dpp8::lane_group_size == 0) ||
                  (!ShareA && TN1 % dpp8::lane_group_size == 0));
    static constexpr index_t shared_elems_per_lane =
        ShareA ? TM1 / dpp8::lane_group_size : TN1 / dpp8::lane_group_size;

    __device__ constexpr ThreadwiseContractionDlDpp8_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1()
    {
        static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() &&
                          BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() &&
                          CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(),
                      "wrong! Desc should be known at compile-time");

        static_assert(TKLengths::Size() == 2 && TMLengths::Size() == 2 && TNLengths::Size() == 2,
                      "wrong!");
    }

    template <typename ABuffer,
              typename AOriginIdx,
              typename BBuffer,
              typename BOriginIdx,
              typename CBuffer,
              typename COriginIdx>
    __device__ static void Run(
        const ABuffer& a_buf, AOriginIdx, const BBuffer& b_buf, BOriginIdx, CBuffer& c_buf, COriginIdx)
    {
        static_assert(is_known_at_compile_time<remove_cvref_t<AOriginIdx>>::value &&
                          is_known_at_compile_time<remove_cvref_t<BOriginIdx>>::value &&
                          is_known_at_compile_time<remove_cvref_t<COriginIdx>>::value,
                      "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time");

        static_assert(is_same<remove_cvref_t<typename ABuffer::type>, remove_cvref_t<FloatA>>::value &&
                          is_same<remove_cvref_t<typename BBuffer::type>, remove_cvref_t<FloatB>>::value &&
                          is_same<remove_cvref_t<typename CBuffer::type>, remove_cvref_t<FloatC>>::value &&
                      "wrong! inconsistent type");

        constexpr auto a_origin_idx = to_multi_index(AOriginIdx{});
        constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
        constexpr auto c_origin_idx = to_multi_index(COriginIdx{});

        static_for<0, TK0, 1>{}([&](auto tk0) {
            static_for<0, TM1, 1>{}([&](auto tm1) {
                static_for<0, TN1, 1>{}([&](auto tn1) {
                    vector_type<FloatA, TK1> a_vec;
                    vector_type<FloatB, TK1> b_vec;

                    static_for<0, TK1, 1>{}([&](auto tk1) {
                        constexpr index_t local_tm1 = ShareA ? tm1 % shared_elems_per_lane : tm1;
                        constexpr index_t a_offset  = AThreadDesc_TK0_TM0_TM1_TK1{}.CalculateOffset(
                            a_origin_idx + make_multi_index(tk0, 0, local_tm1, tk1));

                        constexpr index_t local_tn1 = ShareA ? tn1 : tn1 % shared_elems_per_lane;
                        constexpr index_t b_offset  = BThreadDesc_TK0_TN0_TN1_TK1{}.CalculateOffset(
                            b_origin_idx + make_multi_index(tk0, 0, local_tn1, tk1));

                        a_vec.template AsType<FloatA>()(tk1) = a_buf[Number<a_offset>{}];
                        b_vec.template AsType<FloatB>()(tk1) = b_buf[Number<b_offset>{}];
                    });

                    using a_vector_t = typename vector_type<FloatA, TK1>::type;
                    using b_vector_t = typename vector_type<FloatB, TK1>::type;

                    constexpr index_t c_offset = CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
                        c_origin_idx + make_multi_index(0, tm1, 0, tn1));

                    constexpr int src_lane =
                        ShareA ? (tm1 / shared_elems_per_lane) % dpp8::lane_group_size
                               : (tn1 / shared_elems_per_lane) % dpp8::lane_group_size;

                    dpp8::inner_product_dpp<a_vector_t, b_vector_t, FloatC, src_lane, ShareA>(
                        a_vec.template AsType<a_vector_t>()[I0],
                        b_vec.template AsType<b_vector_t>()[I0],
                        c_buf(Number<c_offset>{}));
                });
            });
        });
    }
};

} // namespace ck
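The deleted contraction relied on the index split documented in its comment: with ShareA set, each lane stores TM1 / lane_group_size rows of A locally and pulls the remaining rows from the other lanes of the DPP8 group. A small host-side sketch of that arithmetic, with assumed tile sizes:

// Assumed values: TM1 = 16 rows per thread tile, lane_group_size = 8 (DPP8).
constexpr int lane_group_size       = 8;
constexpr int TM1                   = 16;
constexpr int shared_elems_per_lane = TM1 / lane_group_size; // 2 rows stored per lane

constexpr int src_lane_of(int tm1)  { return (tm1 / shared_elems_per_lane) % lane_group_size; }
constexpr int local_tm1_of(int tm1) { return tm1 % shared_elems_per_lane; }

static_assert(src_lane_of(0) == 0 && src_lane_of(3) == 1 && src_lane_of(15) == 7);
static_assert(local_tm1_of(3) == 1 && local_tm1_of(15) == 1);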
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp  (+2, -2)

@@ -104,13 +104,13 @@ struct ThreadwiseTensorSliceTransfer_v6r1
             // apply pointwise operation
             static_for<0, ScalarPerVector, 1>{}([&](auto i) {
-                SrcData v;
+                DstData v;

                 // apply element-wise operation
                 element_op_(v, src_vector_container.template AsType<SrcData>()[i]);

                 // apply type convert
-                dst_vector_container.template AsType<DstData>()(i) = type_convert<DstData>(v);
+                dst_vector_container.template AsType<DstData>()(i) = v;
             });

             const bool is_dst_valid =
include/ck/tensor_operation/gpu/warp/dpp_gemm.hpp  (new file, +322)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/amd_gemm_dpp.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/utility/math.hpp"

namespace ck {

enum struct DppInstr
{
    dpp8_f16_16x16x2 = 0,
    dpp8_f16_8x32x2,
    dpp8_f16_32x8x2
};

/**
 * Structure representing DPP GEMM executed by a single wavefront.
 *
 * Each structure instantiation must contain the following fields:
 * - wave_size - number of threads that execute a single DPP GEMM operation, usually equal to the
 *   number of threads in a wavefront;
 * - lanegroup_size - number of threads (lanes) that share data using DPP instruction modifier,
 *   it's 8 in case of DPP8;
 * - m_per_wave - size along M dimension of matrix C that is processed in a single DPP GEMM
 *   operation;
 * - n_per_wave - size along N dimension of matrix C that is processed in a single DPP GEMM
 *   operation;
 * - m_per_lanegroup - size along M dimension that is processed by a single lanegroup;
 * - n_per_lanegroup - size along N dimension that is processed by a single lanegroup;
 * - m_per_thread - size along M dimension of the tile calculated by a single thread;
 * - n_per_thread - size along N dimension of the tile calculated by a single thread;
 * - k_per_dpp - size along K dimension that is reduced in a single DPP GEMM operation;
 * - share_a - indicates whether we share matrix A or matrix B between lanes using DPP modifiers.
 *
 * Not all the combinations are supported now; for the current restrictions see the static asserts
 * in the DppSelector's constructor.
 */
template <DppInstr instr>
struct dpp_type;

template <>
struct dpp_type<DppInstr::dpp8_f16_32x8x2>
{
    static constexpr index_t wave_size       = 32;
    static constexpr index_t lanegroup_size  = 8;
    static constexpr index_t m_per_wave      = 32;
    static constexpr index_t n_per_wave      = 8;
    static constexpr index_t m_per_lanegroup = 8;
    static constexpr index_t n_per_lanegroup = 8;
    static constexpr index_t m_per_thread    = 8;
    static constexpr index_t n_per_thread    = 1;
    static constexpr index_t k_per_dpp       = 2;
    static constexpr bool share_a            = true;
    using BaseType                           = half_t;

    template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
    __device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
    {
        dpp8::DppLanegroupGemm<m_per_thread, n_per_thread, k_per_dpp, BaseType,
                               ADataType, BDataType, CDataType, share_a>{}
            .Run(a, b, reg_c);
    }
};

template <>
struct dpp_type<DppInstr::dpp8_f16_8x32x2>
{
    static constexpr index_t wave_size       = 32;
    static constexpr index_t lanegroup_size  = 8;
    static constexpr index_t m_per_wave      = 8;
    static constexpr index_t n_per_wave      = 32;
    static constexpr index_t m_per_lanegroup = 8;
    static constexpr index_t n_per_lanegroup = 8;
    static constexpr index_t m_per_thread    = 8;
    static constexpr index_t n_per_thread    = 1;
    static constexpr index_t k_per_dpp       = 2;
    static constexpr bool share_a            = true;
    using BaseType                           = half_t;

    template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
    __device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
    {
        dpp8::DppLanegroupGemm<m_per_thread, n_per_thread, k_per_dpp, BaseType,
                               ADataType, BDataType, CDataType, share_a>{}
            .Run(a, b, reg_c);
    }
};

template <>
struct dpp_type<DppInstr::dpp8_f16_16x16x2>
{
    static constexpr index_t wave_size       = 32;
    static constexpr index_t lanegroup_size  = 8;
    static constexpr index_t m_per_wave      = 16;
    static constexpr index_t n_per_wave      = 16;
    static constexpr index_t m_per_lanegroup = 8;
    static constexpr index_t n_per_lanegroup = 8;
    static constexpr index_t m_per_thread    = 8;
    static constexpr index_t n_per_thread    = 1;
    static constexpr index_t k_per_dpp       = 2;
    static constexpr bool share_a            = true;
    using BaseType                           = half_t;

    template <index_t MPerDpp, index_t NPerDpp, class ADataType, class BDataType, class CDataType>
    __device__ void run(const ADataType& a, const BDataType& b, CDataType& reg_c) const
    {
        dpp8::DppLanegroupGemm<m_per_thread, n_per_thread, k_per_dpp, BaseType,
                               ADataType, BDataType, CDataType, share_a>{}
            .Run(a, b, reg_c);
    }
};

template <typename BaseType, index_t MPerDpp, index_t NPerDpp>
struct DppSelector
{
    template <typename BaseType_, index_t MPerDpp_, index_t NPerDpp_>
    static constexpr auto GetDpp();

    template <>
    static constexpr auto GetDpp<half_t, 8, 32>()
    {
        return DppInstr::dpp8_f16_8x32x2;
    }

    template <>
    static constexpr auto GetDpp<half_t, 16, 16>()
    {
        return DppInstr::dpp8_f16_16x16x2;
    }

    template <>
    static constexpr auto GetDpp<half_t, 32, 8>()
    {
        return DppInstr::dpp8_f16_32x8x2;
    }

    static constexpr auto selected_dpp = dpp_type<GetDpp<BaseType, MPerDpp, NPerDpp>()>{};

    __host__ __device__ constexpr DppSelector()
    {
        static_assert(selected_dpp.m_per_wave % selected_dpp.m_per_lanegroup == 0);
        static_assert(selected_dpp.n_per_wave % selected_dpp.n_per_lanegroup == 0);
        static_assert(selected_dpp.k_per_dpp % 2 == 0);

        static_assert(selected_dpp.wave_size % selected_dpp.lanegroup_size == 0);
        constexpr index_t num_dpp_per_wave = selected_dpp.wave_size / selected_dpp.lanegroup_size;
        constexpr index_t num_wave_c_elems = selected_dpp.m_per_wave * selected_dpp.n_per_wave;
        constexpr index_t num_dpp_c_elems =
            selected_dpp.m_per_lanegroup * selected_dpp.n_per_lanegroup;
        static_assert(num_wave_c_elems % num_dpp_c_elems == 0);
        static_assert(num_dpp_per_wave == num_wave_c_elems / num_dpp_c_elems);

        if constexpr(selected_dpp.share_a)
        {
            static_assert(selected_dpp.m_per_lanegroup == selected_dpp.m_per_thread);
            static_assert(selected_dpp.n_per_lanegroup % selected_dpp.n_per_thread == 0);
            static_assert(selected_dpp.n_per_lanegroup / selected_dpp.n_per_thread ==
                          selected_dpp.lanegroup_size);
        }
        else
        {
            static_assert(selected_dpp.m_per_lanegroup % selected_dpp.n_per_thread == 0);
            static_assert(selected_dpp.m_per_lanegroup / selected_dpp.n_per_thread ==
                          selected_dpp.lanegroup_size);
            static_assert(selected_dpp.n_per_lanegroup == selected_dpp.n_per_thread);
        }

        // Below checks come from the restrictions of the current implementation, could be removed
        // in the future when the implementation is more generalized.
        static_assert(selected_dpp.share_a);
        static_assert(selected_dpp.n_per_thread == 1);
        static_assert(selected_dpp.m_per_thread == selected_dpp.lanegroup_size);
        static_assert(selected_dpp.m_per_lanegroup == selected_dpp.m_per_thread);
        static_assert(selected_dpp.n_per_lanegroup ==
                      selected_dpp.n_per_thread * selected_dpp.lanegroup_size);
    }

    static constexpr index_t GetK1PerDpp() { return selected_dpp.k_per_dpp; }
};

template <typename BaseType, index_t MPerDpp, index_t NPerDpp, index_t KPack>
struct DppGemm
{
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};
    static constexpr auto I3 = Number<3>{};
    static constexpr auto I4 = Number<4>{};
    static constexpr auto I5 = Number<5>{};

    using CIndex   = MultiIndex<2>;
    using CIndex4D = MultiIndex<4>;

    __host__ __device__ constexpr DppGemm()
    {
        static_assert(MPerDpp == 8 || MPerDpp == 16 || MPerDpp == 32,
                      "MPerDpp must be either 8, 16 or 32.");
        static_assert(NPerDpp == 8 || NPerDpp == 16 || NPerDpp == 32,
                      "NPerDpp must be either 8, 16 or 32.");
        static_assert(KPack % dpp_instr.k_per_dpp == 0, "KPack must be divisible by k_per_dpp.");
    }

    __device__ static constexpr index_t GetRegSizePerDpp()
    {
        return MPerDpp * NPerDpp / dpp_instr.wave_size;
    }

    template <class ADataType, class BDataType, class CDataType>
    __device__ void Run(const ADataType& p_a_wave, const BDataType& p_b_wave, CDataType& p_c_thread) const
    {
        static_assert(is_same<BaseType, double>::value || is_same<BaseType, float>::value ||
                          is_same<BaseType, half_t>::value || is_same<BaseType, bhalf_t>::value ||
                          is_same<BaseType, int8_t>::value || is_same<BaseType, f8_t>::value,
                      "base BaseType must be double, float, half, bfloat16, and int8_t!");

        static_for<0, KPack / dpp_instr.k_per_dpp, 1>{}([&](auto k) {
            dpp_instr.template run<MPerDpp, NPerDpp>(p_a_wave[k], p_b_wave[k], p_c_thread);
        });
    }

    __device__ static auto GetLaneIdInWave()
    {
        return get_thread_local_1d_id() % dpp_instr.wave_size;
    }

    __device__ static auto GetWaveId() { return get_thread_local_1d_id() / dpp_instr.wave_size; }

    __device__ static auto GetLaneIdInLaneGroup()
    {
        return get_thread_local_1d_id() % dpp_instr.lanegroup_size;
    }

    __device__ static auto GetLaneGroupIdInWave()
    {
        return GetLaneIdInWave() / dpp_instr.lanegroup_size;
    }

    __device__ static auto GetDppOpIdx()
    {
        const auto lanegroupId = GetLaneGroupIdInWave();

        constexpr auto lanegroup_idx_1d_to_dpp_idx_2d_adaptor = make_single_stage_tensor_adaptor(
            make_tuple(make_merge_transform(
                make_tuple(dpp_instr.m_per_wave / dpp_instr.m_per_lanegroup,
                           dpp_instr.n_per_wave / dpp_instr.n_per_lanegroup))),
            make_tuple(Sequence<0, 1>{}),
            make_tuple(Sequence<0>{}));

        const auto dpp_idx = lanegroup_idx_1d_to_dpp_idx_2d_adaptor.CalculateBottomIndex(
            make_multi_index(lanegroupId));

        const auto m_dpp_idx = dpp_idx[I0];
        const auto n_dpp_idx = dpp_idx[I1];

        return make_tuple(m_dpp_idx, n_dpp_idx);
    }

    __host__ __device__ static auto CalculateAThreadOriginDataIndex_K_M()
    {
        const auto laneId   = get_thread_local_1d_id();
        const auto wave_row = laneId / dpp_instr.n_per_wave;
        auto m_idx          = dpp_instr.m_per_thread * wave_row + GetLaneIdInLaneGroup();
        return make_tuple(0, m_idx % dpp_instr.m_per_wave);
    }

    __host__ __device__ static auto CalculateBThreadOriginDataIndex_K_N()
    {
        const auto laneId = get_thread_local_1d_id();
        return make_tuple(0, laneId % dpp_instr.n_per_wave);
    }

    __device__ static CIndex GetBeginOfThreadBlk()
    {
        const auto dpp_op_idx = GetDppOpIdx();

        const auto m_dpp_op_idx = dpp_op_idx[I0];
        const auto n_dpp_op_idx = dpp_op_idx[I1];

        index_t n_offset = n_dpp_op_idx * dpp_instr.n_per_lanegroup + GetLaneIdInLaneGroup();
        index_t m_offset = m_dpp_op_idx * dpp_instr.m_per_lanegroup;

        return CIndex{m_offset, n_offset};
    }

    static constexpr auto dpp = DppSelector<BaseType, MPerDpp, NPerDpp>{};

    static constexpr auto dpp_instr = dpp.selected_dpp;

    static constexpr auto K0PerDpp = 1;
    static constexpr auto K1PerDpp = dpp.GetK1PerDpp();

    __host__ __device__ static constexpr auto GetCMNThreadBlkLengths()
    {
        return make_tuple(Number<dpp_instr.m_per_thread>{}, Number<dpp_instr.n_per_thread>{});
    }
};

} // namespace ck
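As a quick cross-check of how the per-wave, per-lanegroup and per-thread fields documented above relate, here is an illustrative sketch using the dpp8_f16_32x8x2 numbers from the new file (the asserts themselves are not part of the commit):

// dpp8_f16_32x8x2: one wave of 32 lanes computes a 32x8 C tile, 8 lanes per lanegroup.
constexpr int wave_size = 32, lanegroup_size = 8;
constexpr int m_per_wave = 32, n_per_wave = 8;
constexpr int m_per_lanegroup = 8, n_per_lanegroup = 8;
constexpr int m_per_thread = 8, n_per_thread = 1;

// 4 lanegroups per wave, each owning an 8x8 sub-tile of the 32x8 wave tile.
static_assert(wave_size / lanegroup_size ==
              (m_per_wave / m_per_lanegroup) * (n_per_wave / n_per_lanegroup));
// Each lane holds m_per_thread x n_per_thread = 8 accumulators, which matches
// GetRegSizePerDpp() = MPerDpp * NPerDpp / wave_size = 32 * 8 / 32.
static_assert(m_per_thread * n_per_thread == m_per_wave * n_per_wave / wave_size);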
include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp  (+18, -15)

@@ -164,6 +164,7 @@ template <
     index_t BK1,
     index_t GemmMPerBlock,
     index_t GemmNPerBlock,
+    index_t GemmKPerBlock,
     bool DoPadGemmM,
     bool DoPadGemmN>
 struct TransformConvBwdDataToGemm_v1

@@ -308,9 +309,6 @@ struct TransformConvBwdDataToGemm_v1
         const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde);
         const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde);

-        const index_t AK0 =
-            math::integer_divide_ceil(ZDotSlice * YDotSlice * XDotSlice * K, AK1);
-
         if constexpr(NDimSpatial == 2)
         {
             // A: output tensor

@@ -367,9 +365,11 @@ struct TransformConvBwdDataToGemm_v1
             const auto out_gemmk_gemmm_padded_grid_desc =
                 ck::tensor_operation::device::PadTensorDescriptor(
                     out_gemmk_gemmmraw_grid_desc,
-                    make_tuple(AK1, GemmMPerBlock),
+                    make_tuple(GemmKPerBlock, GemmMPerBlock),
                     Sequence<true, DoPadGemmM>{});

+            const index_t AK0 = out_gemmk_gemmm_padded_grid_desc.GetLength(I0) / AK1;
+
             const auto out_gemmak0_gemmm_gemmak1_grid_desc = transform_tensor_descriptor(
                 out_gemmk_gemmm_padded_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),

@@ -460,9 +460,11 @@ struct TransformConvBwdDataToGemm_v1
             const auto out_gemmk_gemmm_padded_grid_desc =
                 ck::tensor_operation::device::PadTensorDescriptor(
                     out_gemmk_gemmmraw_grid_desc,
-                    make_tuple(AK1, GemmMPerBlock),
+                    make_tuple(GemmKPerBlock, GemmMPerBlock),
                     Sequence<true, DoPadGemmM>{});

+            const index_t AK0 = out_gemmk_gemmm_padded_grid_desc.GetLength(I0) / AK1;
+
             const auto out_gemmak0_gemmm_gemmak1_grid_desc = transform_tensor_descriptor(
                 out_gemmk_gemmm_padded_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),

@@ -568,9 +570,6 @@ struct TransformConvBwdDataToGemm_v1
         const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde);
         const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde);

-        const index_t BK0 =
-            math::integer_divide_ceil(ZDotSlice * YDotSlice * XDotSlice * K, BK1);
-
         // B weight tensor
         if constexpr(NDimSpatial == 2)
         {

@@ -617,9 +616,11 @@ struct TransformConvBwdDataToGemm_v1
             const auto wei_gemmk_gemmn_padded_grid_desc =
                 ck::tensor_operation::device::PadTensorDescriptor(
                     wei_gemmk_gemmnraw_grid_desc,
-                    make_tuple(BK1, GemmNPerBlock),
+                    make_tuple(GemmKPerBlock, GemmNPerBlock),
                     Sequence<true, DoPadGemmN>{});

+            const index_t BK0 = wei_gemmk_gemmn_padded_grid_desc.GetLength(I0) / BK1;
+
             const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc = transform_tensor_descriptor(
                 wei_gemmk_gemmn_padded_grid_desc,
                 make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)),

@@ -690,17 +691,19 @@ struct TransformConvBwdDataToGemm_v1
                 make_tuple(Sequence<1, 2, 3, 0>{}, Sequence<4>{}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}));

-            const auto wei_gemmk_gemm_padded_grid_desc =
+            const auto wei_gemmk_gemmn_padded_grid_desc =
                 ck::tensor_operation::device::PadTensorDescriptor(
                     wei_gemmk_gemmnraw_grid_desc,
-                    make_tuple(BK1, GemmNPerBlock),
+                    make_tuple(GemmKPerBlock, GemmNPerBlock),
                     Sequence<true, DoPadGemmN>{});

+            const index_t BK0 = wei_gemmk_gemmn_padded_grid_desc.GetLength(I0) / BK1;
+
             const auto wei_gemmbk0_gemm_gemmbk1_grid_desc = transform_tensor_descriptor(
-                wei_gemmk_gemm_padded_grid_desc,
+                wei_gemmk_gemmn_padded_grid_desc,
                 make_tuple(
                     make_unmerge_transform(make_tuple(BK0, BK1)),
-                    make_pass_through_transform(wei_gemmk_gemm_padded_grid_desc.GetLength(I1))),
+                    make_pass_through_transform(wei_gemmk_gemmn_padded_grid_desc.GetLength(I1))),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
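The refactor above derives AK0/BK0 from the already padded K extent instead of the raw slice product, so the division by AK1/BK1 is exact. A tiny worked sketch with assumed values:

// Assumed example values, for illustration only.
constexpr int K_raw         = 100; // ZDotSlice * YDotSlice * XDotSlice * K before padding
constexpr int GemmKPerBlock = 32;
constexpr int AK1           = 8;

// PadTensorDescriptor pads the K extent up to a multiple of GemmKPerBlock ...
constexpr int K_padded = ((K_raw + GemmKPerBlock - 1) / GemmKPerBlock) * GemmKPerBlock; // 128
// ... so AK0 taken from the padded descriptor divides exactly.
constexpr int AK0 = K_padded / AK1; // 16
static_assert(AK0 * AK1 == K_padded);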
include/ck/utility/amd_gemm_dpp.hpp  (+51, -5)

@@ -5,17 +5,63 @@
 #include "ck/utility/common_header.hpp"
 #include "ck/utility/math.hpp"
-#include "ck/utility/amd_gemm_dpp.hpp"
+#include "ck/utility/inner_product_dpp8.hpp"

 namespace ck {
 namespace dpp8 {

-/// Number of lanes that can share data using DPP8 modifiers.
-constexpr index_t lane_group_size = 8;
-__device__ index_t get_lane_group_local_idx() { return threadIdx.x / lane_group_size; }
-__device__ index_t get_thread_idx_in_lane_group() { return threadIdx.x % lane_group_size; }
+template <class ABDataType>
+struct dpp_datatypes;
+
+template <>
+struct dpp_datatypes<half_t>
+{
+    // Dot product of `half2_t` and `half2_t` to get `float`. Reducing 2 elements from K in a
+    // single instruction.
+    using a_dtype                        = half_t;
+    using b_dtype                        = half_t;
+    using c_dtype                        = float;
+    static constexpr index_t k_per_instr = 2;
+};
+
+template <index_t MPerThread,
+          index_t NPerThread,
+          index_t KPerThread,
+          class BaseInputType,
+          class AVecDataType,
+          class BVecDataType,
+          class CVecDataType,
+          bool ShareA>
+struct DppLanegroupGemm
+{
+    using datatypes_conf = dpp_datatypes<BaseInputType>;
+
+    using ADataType = typename datatypes_conf::a_dtype;
+    using BDataType = typename datatypes_conf::b_dtype;
+    using CDataType = typename datatypes_conf::c_dtype;
+
+    __device__ void Run(const AVecDataType& a_vec, const BVecDataType& b_vec, CVecDataType& c_vec)
+    {
+        constexpr index_t num_c_elems_per_thread = ShareA ? MPerThread : NPerThread;
+
+        const vector_type<ADataType, KPerThread> a_vector{a_vec};
+        const vector_type<BDataType, KPerThread> b_vector{b_vec};
+
+        static_for<0, num_c_elems_per_thread, 1>{}([&](auto c_idx) {
+            float c = c_vec.template AsType<CDataType>()(c_idx);
+            // Next `c_idx` implies that we need to pull data from the next lane.
+            constexpr index_t source_lane = c_idx;
+            static_for<0, KPerThread / datatypes_conf::k_per_instr, 1>{}([&](auto k_chunk) {
+                const auto a_k_vec = a_vector.template AsType<AVecDataType>()[k_chunk];
+                const auto b_k_vec = b_vector.template AsType<BVecDataType>()[k_chunk];
+                ck::dpp8::inner_product_dpp<AVecDataType, BVecDataType, CDataType, source_lane, ShareA>(
+                    a_k_vec, b_k_vec, c);
+            });
+            c_vec.template AsType<CDataType>()(c_idx) = c;
+        });
+    }
+};

 } // namespace dpp8
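In DppLanegroupGemm above, accumulator c_idx selects the DPP8 source lane for the shared operand, so each lane of an 8-lane group ends up with one 8x1 column of the group's 8x8 C sub-tile. A host-side simulation of that pattern (ShareA, K = 2, example data only):

#include <array>
#include <cstdio>

// Each lane stores one row of A and one column fragment of B; with src_lane = c_idx
// every lane reads lane c_idx's A row, so lane l accumulates column l of the sub-tile.
int main()
{
    constexpr int lanes = 8, k = 2;
    std::array<std::array<float, k>, lanes> a{}, b{};
    for(int l = 0; l < lanes; ++l)
        for(int kk = 0; kk < k; ++kk)
        {
            a[l][kk] = 1.0f + l;  // arbitrary example data
            b[l][kk] = 0.5f + kk;
        }

    std::array<std::array<float, lanes>, lanes> c{}; // c[lane][c_idx] = C(row c_idx, col lane)
    for(int l = 0; l < lanes; ++l)
        for(int c_idx = 0; c_idx < lanes; ++c_idx)
            for(int kk = 0; kk < k; ++kk)
                c[l][c_idx] += a[c_idx][kk] * b[l][kk]; // emulated DPP8 broadcast of a[c_idx]

    std::printf("C(0,0) = %g, C(7,7) = %g\n", c[0][0], c[7][7]);
    return 0;
}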
include/ck/utility/inner_product_dpp8.hpp  (+4, -0)

@@ -2,6 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

 #include "amd_gemm_dpp.hpp"
 #include "data_type.hpp"
 #include "type_convert.hpp"

@@ -10,6 +11,9 @@ namespace ck {
 namespace dpp8 {

+/// Number of lanes that can share data using DPP8 modifiers.
+constexpr index_t lane_group_size = 8;
+
 template <int SrcLaneIdx>
 __device__ void inline_v_dot2c_dpp8_instr(const half2_t& a, const half2_t& b, float& c);
include/ck/utility/loop_scheduler.hpp  (new file, +26)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"

namespace ck {

enum struct LoopScheduler
{
    Default,
    Interwave,
};

constexpr LoopScheduler make_default_loop_scheduler()
{
#if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
    return LoopScheduler::Interwave;
#else
    return LoopScheduler::Default;
#endif // if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING
}

} // namespace ck
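A minimal sketch of how a kernel template might consume the new selector; the type name and its member are assumptions for illustration, and only make_default_loop_scheduler() comes from the header:

#include "ck/utility/loop_scheduler.hpp"

// Hypothetical consumer: pick a pipeline variant at compile time from the default scheduler.
template <ck::LoopScheduler Scheduler = ck::make_default_loop_scheduler()>
struct MyGridwiseGemmSketch
{
    static constexpr bool use_interwave = (Scheduler == ck::LoopScheduler::Interwave);
};

// Building with -DCK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING=1 flips the default.
static_assert(MyGridwiseGemmSketch<>::use_interwave ==
              (ck::make_default_loop_scheduler() == ck::LoopScheduler::Interwave));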
include/ck/utility/reduction_operator.hpp  (+60, -1)

@@ -116,7 +116,15 @@ struct Max
     template <typename T>
     __host__ __device__ static constexpr T GetIdentityValue()
     {
-        return NumericLimits<T>::Lowest();
+        if constexpr(is_same_v<T, bhalf_t>)
+        {
+            float val = NumericLimits<float>::Lowest();
+            return type_convert<bhalf_t>(val);
+        }
+        else
+        {
+            return NumericLimits<T>::Lowest();
+        }
     };

     __host__ __device__ static constexpr bool

@@ -138,6 +146,15 @@ struct Max
             a = b;
     }

+    __host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b) const
+    {
+        float a_ = type_convert<float>(a);
+        float b_ = type_convert<float>(b);
+
+        if(a_ < b_)
+            a = b;
+    }
+
     template <typename T>
     __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
     {

@@ -152,6 +169,18 @@ struct Max
             changed = true;
         }
     }

+    __host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b, bool& changed) const
+    {
+        float a_ = type_convert<float>(a);
+        float b_ = type_convert<float>(b);
+
+        if(a_ < b_)
+        {
+            a       = b;
+            changed = true;
+        }
+    }
 };

 struct Min

@@ -159,6 +188,15 @@ struct Min
     template <typename T>
     __host__ __device__ static constexpr T GetIdentityValue()
     {
+        if constexpr(is_same_v<T, bhalf_t>)
+        {
+            float val = NumericLimits<float>::Max();
+            return type_convert<bhalf_t>(val);
+        }
+        else
+        {
+            return NumericLimits<T>::Max();
+        }
         return NumericLimits<T>::Max();
     };

@@ -181,6 +219,15 @@ struct Min
             a = b;
     }

+    __host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b) const
+    {
+        float a_ = type_convert<float>(a);
+        float b_ = type_convert<float>(b);
+
+        if(a_ > b_)
+            a = b;
+    }
+
     template <typename T>
     __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const
     {

@@ -195,6 +242,18 @@ struct Min
             changed = true;
         }
     }

+    __host__ __device__ inline constexpr void operator()(bhalf_t& a, bhalf_t b, bool& changed) const
+    {
+        float a_ = type_convert<float>(a);
+        float b_ = type_convert<float>(b);
+
+        if(a_ > b_)
+        {
+            a       = b;
+            changed = true;
+        }
+    }
 };

 struct AMax
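The new bhalf_t overloads compare through float rather than comparing the 8-bit-mantissa bf16 values directly. An illustrative stand-alone sketch of the same idea, using a toy bf16 stand-in rather than ck::bhalf_t:

#include <cstdint>
#include <cstring>

// Toy bf16 stand-in: keep only the top 16 bits of a float (truncation).
using bf16_bits = std::uint16_t;
inline bf16_bits to_bf16(float f)      { std::uint32_t u; std::memcpy(&u, &f, 4); return bf16_bits(u >> 16); }
inline float     to_float(bf16_bits h) { std::uint32_t u = std::uint32_t(h) << 16; float f; std::memcpy(&f, &u, 4); return f; }

// Max reduction step on the bf16 stand-in, mirroring the new operator(): compare in float.
inline void max_step(bf16_bits& a, bf16_bits b)
{
    if(to_float(a) < to_float(b))
        a = b;
}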
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp  (+2, -2)

@@ -92,11 +92,11 @@ struct ReferenceGemm : public device::BaseOperator
                         ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
                 }

-                AccDataType v_c;
+                CDataType v_c;

                 arg.c_element_op_(v_c, v_acc);

-                arg.c_m_n_(m, n) = ck::type_convert<CDataType>(v_c);
+                arg.c_m_n_(m, n) = v_c;
             };

             make_ParallelTensorFunctor(
library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp  (new file, +325)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <type_traits>
#include <sstream>

#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/utility/host_tensor.hpp"

namespace ck {
namespace tensor_operation {
namespace host {

/**
 * \brief Reference implementation for image to column.
 *
 * Tensor descriptor has [G, N, C, Di, Hi, Wi] data layout.
 * G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C].
 *
 * \tparam NDimSpatial Number of spatial dimensions.
 * \tparam InputLayout Input Layout.
 * \tparam InDataType Input Data Type.
 * \tparam OutDataType Output Data Type.
 */
template <ck::index_t NDimSpatial,
          typename InputLayout,
          typename InDataType,
          typename OutDataType,
          typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
struct ReferenceImageToColumn : public device::BaseOperator
{
    // Argument
    struct Argument : public device::BaseArgument
    {
        public:
        Argument(const Tensor<InDataType>& input,
                 Tensor<OutDataType>& output,
                 std::vector<ck::index_t> filter_spatial_lengths,
                 std::vector<ck::index_t> conv_filter_strides,
                 std::vector<ck::index_t> conv_filter_dilations,
                 std::vector<ck::index_t> input_left_pads,
                 std::vector<ck::index_t> input_right_pads)
            : input_{input},
              output_{output},
              conv_strides_{conv_filter_strides},
              conv_dilations_{conv_filter_dilations},
              in_left_pads_{input_left_pads},
              in_right_pads_{input_right_pads},
              filter_spatial_lengths_{filter_spatial_lengths}
        {
            initOutputSpatialLengths();
        }

        const Tensor<InDataType>& input_;
        Tensor<OutDataType>& output_;

        std::vector<index_t> conv_strides_;
        std::vector<index_t> conv_dilations_;
        std::vector<index_t> in_left_pads_;
        std::vector<index_t> in_right_pads_;
        std::vector<index_t> filter_spatial_lengths_;
        std::vector<index_t> output_spatial_lengths_;

        private:
        void initOutputSpatialLengths()
        {
            constexpr auto input_offset_to_spatial = 3;

            for(ck::index_t i = 0; i < NDimSpatial; ++i)
            {
                // XEff = (X - 1) * conv_dilation_w + 1;
                // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
                const ck::index_t x_eff = (filter_spatial_lengths_[i] - 1) * conv_dilations_[i] + 1;

                output_spatial_lengths_.push_back(
                    (input_.GetLengths()[i + input_offset_to_spatial] + in_left_pads_[i] +
                     in_right_pads_[i] - x_eff) /
                        conv_strides_[i] +
                    1);
            }
        }
    };

    struct Invoker : public device::BaseInvoker
    {
        using Argument = ReferenceImageToColumn::Argument;

        float Run(const Argument& arg)
        {
            if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 &&
                 arg.output_.GetNumOfDimension() == 2))
            {
                throw std::runtime_error("wrong! inconsistent dimension");
            }

            const index_t N = arg.input_.GetLengths()[1];
            const index_t C = arg.input_.GetLengths()[2];

            if constexpr(NDimSpatial == 1)
            {
                const index_t Wo = arg.output_spatial_lengths_[0];
                auto func        = [&](auto n, auto wo) {
                    index_t row    = n * Wo + wo;
                    index_t column = 0;

                    for(index_t x = 0; x < arg.filter_spatial_lengths_[0]; ++x)
                    {
                        auto wi = static_cast<ck::long_index_t>(wo * arg.conv_strides_[0]) +
                                  static_cast<ck::long_index_t>(x * arg.conv_dilations_[0]) -
                                  static_cast<ck::long_index_t>(arg.in_left_pads_[0]);
                        for(index_t c = 0; c < C; ++c)
                        {
                            if(wi >= 0 &&
                               ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
                            {
                                InDataType v_in          = arg.input_(0, n, c, wi);
                                arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
                            }
                            column++;
                        }
                    }
                };

                make_ParallelTensorFunctor(func, N, Wo)(std::thread::hardware_concurrency());
                return 0;
            }
            else if constexpr(NDimSpatial == 2)
            {
                const index_t Ho = arg.output_spatial_lengths_[0];
                const index_t Wo = arg.output_spatial_lengths_[1];
                auto func        = [&](auto n, auto ho, auto wo) {
                    index_t row    = n * Ho * Wo + ho * Wo + wo;
                    index_t column = 0;

                    for(index_t y = 0; y < arg.filter_spatial_lengths_[0]; ++y)
                    {
                        auto hi = static_cast<ck::long_index_t>(ho * arg.conv_strides_[0]) +
                                  static_cast<ck::long_index_t>(y * arg.conv_dilations_[0]) -
                                  static_cast<ck::long_index_t>(arg.in_left_pads_[0]);
                        for(index_t x = 0; x < arg.filter_spatial_lengths_[1]; ++x)
                        {
                            auto wi = static_cast<ck::long_index_t>(wo * arg.conv_strides_[1]) +
                                      static_cast<ck::long_index_t>(x * arg.conv_dilations_[1]) -
                                      static_cast<ck::long_index_t>(arg.in_left_pads_[1]);
                            for(index_t c = 0; c < C; ++c)
                            {
                                if(hi >= 0 &&
                                   ck::type_convert<std::size_t>(hi) < arg.input_.GetLengths()[3] &&
                                   wi >= 0 &&
                                   ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
                                {
                                    InDataType v_in          = arg.input_(0, n, c, hi, wi);
                                    arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
                                }
                                column++;
                            }
                        }
                    }
                };

                make_ParallelTensorFunctor(func, N, Ho, Wo)(std::thread::hardware_concurrency());
                return 0;
            }
            else if constexpr(NDimSpatial == 3)
            {
                const index_t Do = arg.output_spatial_lengths_[0];
                const index_t Ho = arg.output_spatial_lengths_[1];
                const index_t Wo = arg.output_spatial_lengths_[2];
                auto func        = [&](auto n, auto d_o, auto ho, auto wo) {
                    index_t row    = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo;
                    index_t column = 0;

                    for(index_t z = 0; z < arg.filter_spatial_lengths_[0]; ++z)
                    {
                        auto di = static_cast<ck::long_index_t>(d_o * arg.conv_strides_[0]) +
                                  static_cast<ck::long_index_t>(z * arg.conv_dilations_[0]) -
                                  static_cast<ck::long_index_t>(arg.in_left_pads_[0]);
                        for(index_t y = 0; y < arg.filter_spatial_lengths_[1]; ++y)
                        {
                            auto hi = static_cast<ck::long_index_t>(ho * arg.conv_strides_[1]) +
                                      static_cast<ck::long_index_t>(y * arg.conv_dilations_[1]) -
                                      static_cast<ck::long_index_t>(arg.in_left_pads_[1]);
                            for(index_t x = 0; x < arg.filter_spatial_lengths_[2]; ++x)
                            {
                                auto wi = static_cast<ck::long_index_t>(wo * arg.conv_strides_[2]) +
                                          static_cast<ck::long_index_t>(x * arg.conv_dilations_[2]) -
                                          static_cast<ck::long_index_t>(arg.in_left_pads_[2]);
                                for(index_t c = 0; c < C; ++c)
                                {
                                    if(di >= 0 &&
                                       ck::type_convert<std::size_t>(di) < arg.input_.GetLengths()[3] &&
                                       hi >= 0 &&
                                       ck::type_convert<std::size_t>(hi) < arg.input_.GetLengths()[4] &&
                                       wi >= 0 &&
                                       ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[5])
                                    {
                                        InDataType v_in = arg.input_(0, n, c, di, hi, wi);
                                        arg.output_(row, column) =
                                            ck::type_convert<OutDataType>(v_in);
                                    }
                                    column++;
                                }
                            }
                        }
                    }
                };

                make_ParallelTensorFunctor(func, N, Do, Ho, Wo)(std::thread::hardware_concurrency());
                return 0;
            }
        }

        float Run(const device::BaseArgument* p_arg,
                  const StreamConfig& /*stream_config*/ = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg));
        }
    };

    static constexpr bool IsValidCompilationParameter()
    {
        using namespace tensor_layout::convolution;
        if constexpr(!(std::is_same_v<InputLayout, GNWC> || std::is_same_v<InputLayout, GNHWC> ||
                       std::is_same_v<InputLayout, GNDHWC>))
        {
            return false;
        }
        if constexpr(!(NDimSpatial >= 1 && NDimSpatial <= 3))
        {
            return false;
        }
        return true;
    }

    bool IsSupportedArgument(const Argument& arg)
    {
        const ck::index_t G = arg.input_.GetLengths()[0];
        const ck::index_t N = arg.input_.GetLengths()[1];
        const ck::index_t C = arg.input_.GetLengths()[2];

        const index_t NDoHoWo =
            N * ck::accumulate_n<index_t>(
                    arg.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
        const index_t CZYX =
            C * ck::accumulate_n<index_t>(
                    arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());

        if(!(arg.output_.GetLengths()[0] == static_cast<std::size_t>(NDoHoWo) &&
             arg.output_.GetLengths()[1] == static_cast<std::size_t>(CZYX)))
        {
            return false;
        }
        if(G != 1)
        {
            return false;
        }
        return true;
    }

    bool IsSupportedArgument(const device::BaseArgument* p_arg) override
    {
        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
    }

    static auto MakeArgument(const Tensor<InDataType>& input,
                             Tensor<OutDataType>& output,
                             std::vector<ck::index_t> filter_spatial_lengths,
                             std::vector<ck::index_t> conv_filter_strides,
                             std::vector<ck::index_t> conv_filter_dilations,
                             std::vector<ck::index_t> input_left_pads,
                             std::vector<ck::index_t> input_right_pads)
    {
        return Argument{input,
                        output,
                        filter_spatial_lengths,
                        conv_filter_strides,
                        conv_filter_dilations,
                        input_left_pads,
                        input_right_pads};
    }

    static auto MakeInvoker() { return Invoker{}; }

    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
    {
        return std::make_unique<Invoker>(Invoker{});
    }

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "ReferenceImageToColumn"
            << std::endl;
        // clang-format on

        return str.str();
    }
};

} // namespace host
} // namespace tensor_operation
} // namespace ck
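The output-length formula in initOutputSpatialLengths, XEff = (X - 1) * dilation + 1 and Wo = (Wi + left_pad + right_pad - XEff) / stride + 1, worked through with assumed example numbers:

// Assumed example: Wi = 32, X = 3, dilation = 2, stride = 2, 1 element of padding per side.
constexpr int Wi = 32, X = 3, dilation = 2, stride = 2, pad_l = 1, pad_r = 1;
constexpr int XEff = (X - 1) * dilation + 1;                   // 5
constexpr int Wo   = (Wi + pad_l + pad_r - XEff) / stride + 1; // (34 - 5) / 2 + 1 = 15
static_assert(XEff == 5 && Wo == 15);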
library/include/ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp  (+10, -1)

@@ -53,7 +53,16 @@ struct ReferenceMaxPoolBwd : public device::BaseOperator
         {
             int index = arg.indices_.mData[i];
             if(index >= 0 && index < din_length)
-                buf[index] += ck::type_convert<ConputeDataType>(arg.dout_.mData[i]);
+            {
+                if constexpr(is_same_v<ConputeDataType, bhalf_t>)
+                {
+                    float buf_val = ck::type_convert<float>(buf[index]);
+                    buf_val += ck::type_convert<float>(arg.dout_.mData[i]);
+                    buf[index] = ck::type_convert<ConputeDataType>(buf_val);
+                }
+                else
+                    buf[index] += ck::type_convert<ConputeDataType>(arg.dout_.mData[i]);
+            }
         }

         for(int i = 0; i < din_length; ++i)
library/include/ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp  (+4, -2)

@@ -256,10 +256,12 @@ struct ReferencePoolingFwd : public device::BaseOperator
                 for(ck::index_t y = 0; y < arg.window_spatial_lengths_[0]; ++y)
                 {
-                    ck::index_t hi = ho * arg.window_strides_[0] + y - arg.in_left_pads_[0];
+                    ck::index_t hi = ho * arg.window_strides_[0] + y * arg.window_dilations_[0] -
+                                     arg.in_left_pads_[0];
                     for(ck::index_t x = 0; x < arg.window_spatial_lengths_[1]; ++x)
                     {
-                        ck::index_t wi = wo * arg.window_strides_[1] + x - arg.in_left_pads_[1];
+                        ck::index_t wi = wo * arg.window_strides_[1] + x * arg.window_dilations_[1] -
+                                         arg.in_left_pads_[1];
                         if(hi >= 0 &&
                            hi < static_cast<ck::index_t>(arg.in_.mDesc.GetLengths()[2]) &&
                            wi >= 0 &&
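With dilation folded in, the input row index of window tap y becomes hi = ho * stride + y * dilation - left_pad; out-of-range values are filtered by the existing bounds check. A small worked sketch with assumed parameters:

// Assumed pooling parameters: stride 2, dilation 3, left pad 1.
constexpr int stride = 2, dilation = 3, pad_l = 1;
constexpr int hi(int ho, int y) { return ho * stride + y * dilation - pad_l; }

static_assert(hi(0, 0) == -1); // falls in the left padding, skipped by the bounds check
static_assert(hi(4, 2) == 13); // 4*2 + 2*3 - 1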
library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp  (+5, -0)

@@ -31,6 +31,9 @@ using F64_Tuple = ck::Tuple<F64>;
 using F32_Tuple     = ck::Tuple<F32>;
 using I32_Tuple     = ck::Tuple<I32>;
 using I32_F32_Tuple = ck::Tuple<I32, F32>;
+using I8_Tuple      = ck::Tuple<I8>;
+using F32_F32_Tuple = ck::Tuple<F32, F32>;

 // GEMM layout
 using Row = ck::tensor_layout::gemm::RowMajor;

@@ -95,9 +98,11 @@ using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
 using AddReluAdd  = ck::tensor_operation::element_wise::AddReluAdd;
 using FastGelu    = ck::tensor_operation::element_wise::FastGelu;
 using AddMultiply = ck::tensor_operation::element_wise::AddMultiply;
+using MultiplyAdd = ck::tensor_operation::element_wise::MultiplyAdd;
 using ScaleAdd    = ck::tensor_operation::element_wise::ScaleAdd;
 using Gelu        = ck::tensor_operation::element_wise::Gelu;
 using Swish       = ck::tensor_operation::element_wise::Swish;
+using Add         = ck::tensor_operation::element_wise::Add;

 template <typename Activation>
 using Activation_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Activation>;
library/include/ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp
0 → 100644
View file @ 705d5a08
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

#ifdef CK_ENABLE_FP16
void add_device_avgpool_bwd_ndhwc_f16_instances(
    std::vector<std::unique_ptr<DeviceAvgPoolBwd<3, F16, F16, NDHWC, NDHWC>>>&);
#endif

#ifdef CK_ENABLE_BF16
void add_device_avgpool_bwd_ndhwc_bf16_instances(
    std::vector<std::unique_ptr<DeviceAvgPoolBwd<3, BF16, BF16, NDHWC, NDHWC>>>&);
#endif

#ifdef CK_ENABLE_FP32
void add_device_avgpool_bwd_ndhwc_f32_instances(
    std::vector<std::unique_ptr<DeviceAvgPoolBwd<3, F32, F32, NDHWC, NDHWC>>>&);
#endif

template <typename DOutDataType, typename DInDataType, typename InLayout, typename OutLayout>
struct DeviceOperationInstanceFactory<
    ck::tensor_operation::device::DeviceAvgPoolBwd<3, DOutDataType, DInDataType, InLayout, OutLayout>>
{
    using DeviceOp = DeviceAvgPoolBwd<3, DOutDataType, DInDataType, InLayout, OutLayout>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

        if constexpr(is_same_v<InLayout, NDHWC> && is_same_v<OutLayout, NDHWC>)
        {
#ifdef CK_ENABLE_FP16
            if constexpr(is_same_v<DOutDataType, F16> && is_same_v<DInDataType, F16>)
                add_device_avgpool_bwd_ndhwc_f16_instances(op_ptrs);
#endif
#ifdef CK_ENABLE_BF16
            else if constexpr(is_same_v<DOutDataType, BF16> && is_same_v<DInDataType, BF16>)
                add_device_avgpool_bwd_ndhwc_bf16_instances(op_ptrs);
#endif
#ifdef CK_ENABLE_FP32
            else if constexpr(is_same_v<DOutDataType, F32> && is_same_v<DInDataType, F32>)
                add_device_avgpool_bwd_ndhwc_f32_instances(op_ptrs);
#endif
        }
        return op_ptrs;
    }
};

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
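The new header wires 3-D average-pool backward instances into the common instance factory. Below is a minimal client sketch; it assumes the F16/NDHWC aliases used above resolve to ck::half_t and the NDHWC convolution layout tag, as elsewhere in the instance library, and that each instance reports its name via GetTypeString() from the common base operator:

// Sketch: enumerate avg-pool-3d backward instances via the factory added in this header.
#include <iostream>
#include "ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp"

int main()
{
    using namespace ck::tensor_operation::device;

    using DeviceOp = DeviceAvgPoolBwd<3,
                                      ck::half_t,                             // dout data type
                                      ck::half_t,                             // din data type
                                      ck::tensor_layout::convolution::NDHWC,  // input layout
                                      ck::tensor_layout::convolution::NDHWC>; // output layout

    const auto op_ptrs = instance::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " avg_pool3d_bwd instances\n";
    for(const auto& op : op_ptrs)
        std::cout << op->GetTypeString() << '\n'; // GetTypeString() from the common base operator
    return 0;
}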
library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
View file @ 705d5a08
...
@@ -23,7 +23,7 @@ void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(
         DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);

-void add_device_gemm_dl_dpp8_f16_f16_f16_km_kn_mn_instances(
+void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
...
@@ -38,7 +38,7 @@ void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(
         DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);

-void add_device_gemm_dl_dpp8_f16_f16_f16_km_nk_mn_instances(
+void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
...
@@ -53,7 +53,7 @@ void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(
         DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);

-void add_device_gemm_dl_dpp8_f16_f16_f16_mk_kn_mn_instances(
+void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
...
@@ -68,7 +68,7 @@ void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(
         DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);

-void add_device_gemm_dl_dpp8_f16_f16_f16_mk_nk_mn_instances(
+void add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
...
@@ -374,7 +374,7 @@ struct DeviceOperationInstanceFactory<
 #ifdef DL_KERNELS
                 add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dl_dpp8_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
 #endif
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
             }
...
@@ -385,7 +385,7 @@ struct DeviceOperationInstanceFactory<
 #ifdef DL_KERNELS
                 add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dl_dpp8_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
 #endif
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                 add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
...
@@ -397,7 +397,7 @@ struct DeviceOperationInstanceFactory<
 #ifdef DL_KERNELS
                 add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dl_dpp8_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(op_ptrs);
 #endif
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs);
             }
...
@@ -408,7 +408,7 @@ struct DeviceOperationInstanceFactory<
 #ifdef DL_KERNELS
                 add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dl_dpp8_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(op_ptrs);
 #endif
                 add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs);
             }
...
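Because clients reach these kernels only through DeviceOperationInstanceFactory, the dl_dpp8 → dpp rename does not affect callers: the DPP instances are still appended next to the DL ones whenever DL_KERNELS is defined. A sketch of that client path for the row-major f16 case; PassThrough and RowMajor are the aliases used above, while the surrounding main() is an assumed typical caller, not code from this commit:

// Sketch: a factory client is insulated from the dl_dpp8 -> dpp rename. It only describes the
// DeviceGemm it wants and receives whatever instances (DL, DPP, XDL) were registered for it.
#include <iostream>
#include "ck/library/tensor_operation_instance/gpu/gemm.hpp"

int main()
{
    using namespace ck::tensor_operation::device;
    using Row         = ck::tensor_layout::gemm::RowMajor;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    using DeviceOp = DeviceGemm<Row, Row, Row,                      // A, B, C layouts (mk_kn_mn)
                                ck::half_t, ck::half_t, ck::half_t, // A, B, C data types
                                PassThrough, PassThrough, PassThrough>;

    const auto op_ptrs = instance::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
    std::cout << op_ptrs.size() << " f16 GEMM instances registered\n";
    return 0;
}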
library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp
View file @ 705d5a08
...
@@ -69,6 +69,58 @@ void add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance
     PassThrough, PassThrough, Bilinear>>>& instances);

+void add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8,
+                                                    PassThrough, PassThrough, Bilinear>>>& instances);
+
+void add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8,
+                                                    PassThrough, PassThrough, Bilinear>>>& instances);
+
+void add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8,
+                                                    PassThrough, PassThrough, Bilinear>>>& instances);
+
+void add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleD<Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8,
+                                                    PassThrough, PassThrough, Bilinear>>>& instances);
+
 // GEMM + Bilinear
 template <typename ALayout,
           typename BLayout,
...
@@ -135,6 +187,30 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMu
                     op_ptrs);
             }
         }
+        else if constexpr(is_same_v<ADataType, std::int8_t> && is_same_v<BDataType, std::int8_t> &&
+                          is_same_v<DDataType, std::int8_t> && is_same_v<EDataType, std::int8_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<DLayout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<DLayout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<DLayout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<DLayout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instances(op_ptrs);
+            }
+        }

         return op_ptrs;
     }
...
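With the WMMA int8 registrations above, the bilinear factory can now also serve std::int8_t GEMMs. A sketch of requesting the row-row int8 case follows; the single-D tuple arguments (ck::Tuple<Row>, ck::Tuple<std::int8_t>) are assumed to mirror the Row_Tuple/I8_Tuple aliases used by the declarations, and the surrounding main() is illustrative only:

// Sketch: pick up the new int8 WMMA bilinear GEMM instances through the factory.
#include <cstdint>
#include <iostream>
#include "ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp"

int main()
{
    using namespace ck::tensor_operation::device;
    using Row         = ck::tensor_layout::gemm::RowMajor;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using Bilinear    = ck::tensor_operation::element_wise::Bilinear;

    // A and B row-major, one extra row-major D tensor, all int8 (the mk_kn_mn_mn case above).
    using DeviceOp = DeviceGemmMultipleD<Row, Row, ck::Tuple<Row>, Row,
                                         std::int8_t, std::int8_t, ck::Tuple<std::int8_t>, std::int8_t,
                                         PassThrough, PassThrough, Bilinear>;

    const auto op_ptrs = instance::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
    std::cout << op_ptrs.size() << " int8 bilinear GEMM instances\n";
    return 0;
}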
library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp
0 → 100644
View file @ 705d5a08
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <cstdlib>
#include <vector>
#include <memory>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row, Row, Row_Row_Tuple, Row, F16, F16,
                                                    F16_F16_Tuple, F16, PassThrough, PassThrough,
                                                    MultiplyAdd>>>&);

void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row, Col, Row_Row_Tuple, Row, F16, F16,
                                                    F16_F16_Tuple, F16, PassThrough, PassThrough,
                                                    MultiplyAdd>>>&);

void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row, Row, Row_Row_Tuple, Row, F16, F8,
                                                    F32_F32_Tuple, F16, PassThrough, PassThrough,
                                                    MultiplyAdd>>>&);

void add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row, Col, Row_Row_Tuple, Row, F16, F8,
                                                    F32_F32_Tuple, F16, PassThrough, PassThrough,
                                                    MultiplyAdd>>>&);

// GEMM + Multiply + Add
template <typename ALayout,
          typename BLayout,
          typename D0Layout,
          typename D1Layout,
          typename ELayout,
          typename ADataType,
          typename BDataType,
          typename D0DataType,
          typename D1DataType,
          typename EDataType>
struct DeviceOperationInstanceFactory<
    ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
                                                      BLayout,
                                                      ck::Tuple<D0Layout, D1Layout>,
                                                      ELayout,
                                                      ADataType,
                                                      BDataType,
                                                      ck::Tuple<D0DataType, D1DataType>,
                                                      EDataType,
                                                      ck::tensor_operation::element_wise::PassThrough,
                                                      ck::tensor_operation::element_wise::PassThrough,
                                                      ck::tensor_operation::element_wise::MultiplyAdd>>
{
    using DeviceOp = DeviceGemmMultipleD<ALayout,
                                         BLayout,
                                         ck::Tuple<D0Layout, D1Layout>,
                                         ELayout,
                                         ADataType,
                                         BDataType,
                                         ck::Tuple<D0DataType, D1DataType>,
                                         EDataType,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::PassThrough,
                                         ck::tensor_operation::element_wise::MultiplyAdd>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                     is_same_v<D0DataType, half_t> && is_same_v<D1DataType, half_t> &&
                     is_same_v<EDataType, half_t>)
        {
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
                         is_same_v<ELayout, Row>)
            {
                add_device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances(op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
                              is_same_v<ELayout, Row>)
            {
                add_device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances(op_ptrs);
            }
        }

        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
                     is_same_v<D0DataType, float> && is_same_v<D1DataType, float> &&
                     is_same_v<EDataType, half_t>)
        {
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
                         is_same_v<ELayout, Row>)
            {
                add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instances(op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<D0Layout, Row> && is_same_v<D1Layout, Row> &&
                              is_same_v<ELayout, Row>)
            {
                add_device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instances(op_ptrs);
            }
        }

        return op_ptrs;
    }
};

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
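This new factory covers two data-type combinations: an all-f16 path and a mixed f16 × f8 path whose two D tensors are f32. A sketch of querying the mixed path for row-major A and column-major B; the client boilerplate is an assumption, while the template arguments follow the declarations above:

// Sketch: enumerate the mixed f16 x f8 GEMM + MultiplyAdd instances declared in this header.
#include <iostream>
#include "ck/library/tensor_operation_instance/gpu/gemm_multiply_add.hpp"

int main()
{
    using namespace ck::tensor_operation::device;
    using Row         = ck::tensor_layout::gemm::RowMajor;
    using Col         = ck::tensor_layout::gemm::ColumnMajor;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using MultiplyAdd = ck::tensor_operation::element_wise::MultiplyAdd;

    // A: f16 row-major, B: f8 column-major, two f32 row-major D tensors, E: f16 row-major.
    using DeviceOp = DeviceGemmMultipleD<Row, Col, ck::Tuple<Row, Row>, Row,
                                         ck::half_t, ck::f8_t, ck::Tuple<float, float>, ck::half_t,
                                         PassThrough, PassThrough, MultiplyAdd>;

    const auto op_ptrs = instance::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
    std::cout << op_ptrs.size() << " f16 x f8 GEMM+MultiplyAdd instances\n";
    return 0;
}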