Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
a3b4c5cb
Commit
a3b4c5cb
authored
Jun 03, 2022
by
wangshaojie6
Browse files
merge develop branch and add gridwise pipeline v3
parents
48918ab9
1677cf70
Changes
361
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1334 additions
and
445 deletions
+1334
-445
include/ck/config.hpp
include/ck/config.hpp
+26
-4
include/ck/hip_version.hpp.in
include/ck/hip_version.hpp.in
+0
-28
include/ck/host_utility/device_prop.hpp
include/ck/host_utility/device_prop.hpp
+50
-0
include/ck/options.hpp
include/ck/options.hpp
+3
-0
include/ck/stream_config.hpp
include/ck/stream_config.hpp
+10
-0
include/ck/tensor_description/tensor_descriptor_helper.hpp
include/ck/tensor_description/tensor_descriptor_helper.hpp
+21
-9
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
.../ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
+7
-9
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
...e/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+262
-16
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
...ration/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
+5
-6
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
...ion/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+19
-22
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
...ion/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp
+24
-27
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
...ion/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp
+29
-32
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
...ion/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp
+33
-36
include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
...gpu/device/convolution_backward_weight_specialization.hpp
+17
-0
include/ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp
...k/tensor_operation/gpu/device/device_5ary_elementwise.hpp
+333
-0
include/ck/tensor_operation/gpu/device/device_base.hpp
include/ck/tensor_operation/gpu/device/device_base.hpp
+13
-6
include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
...on/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
+149
-186
include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
...k/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
+26
-64
include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
...tensor_operation/gpu/device/device_binary_elementwise.hpp
+234
-0
include/ck/tensor_operation/gpu/device/device_cgemm.hpp
include/ck/tensor_operation/gpu/device/device_cgemm.hpp
+73
-0
No files found.
include/ck/config.hpp
View file @
a3b4c5cb
...
...
@@ -26,17 +26,14 @@
#endif
#endif
// buffer resour
se, wave siz
e
// buffer resour
c
e
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_BUFFER_RESOURCE_3RD_DWORD -1
#define CK_GPU_WAVE_SIZE -1
#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#define CK_GPU_WAVE_SIZE 64
#elif defined(__gfx1030__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#define CK_GPU_WAVE_SIZE 32
#endif
// FMA instruction
...
...
@@ -79,6 +76,12 @@
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
#endif
#if defined(__gfx90a__) // for GPU code
#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
#else
#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
#endif
// inline asm
#define CK_USE_AMD_INLINE_ASM 1
...
...
@@ -98,6 +101,7 @@
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK 1
// experimental feature: in-register sub-dword transpose
#define CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE 1
...
...
@@ -112,6 +116,10 @@
// experimental feature: use __builtin_memcpy instead of union to do bit_cast
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
// experimental feature: optimize for inter-wave scheduling policy
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 0
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
// hack: has underlying assumptions that need to be satisfied, otherwise it's a bug
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
// thread-invariant, otherwise it's a bug
...
...
@@ -141,9 +149,23 @@ enum struct InMemoryDataOperationEnum
{
Set
,
AtomicAdd
,
AtomicMax
,
Add
};
// Compile-time list of InMemoryDataOperationEnum values that can be queried by position.
template <InMemoryDataOperationEnum... Is>
struct InMemoryDataOperationEnumSequence
{
    // Number of operations carried by the parameter pack.
    static constexpr int mSize = sizeof...(Is);

    // Returns the operation stored at position I of the pack.
    // I is not range-checked here; index mSize yields the trailing Set placeholder.
    __host__ __device__ static constexpr InMemoryDataOperationEnum At(int I)
    {
        // A trailing Set entry is appended so the array never has zero length, which the
        // compiler would reject when the pack is empty (mSize == 0).
        const InMemoryDataOperationEnum ops[mSize + 1] = {Is..., InMemoryDataOperationEnum::Set};

        return ops[I];
    }
};
// TODO: no longer needed, remove this
enum
struct
ActivTypeEnum
{
...
...
include/ck/hip_version.hpp.in
deleted
100644 → 0
View file @
48918ab9
#pragma once
// "_PACKAGE_" to avoid name contentions: the macros like
// HIP_VERSION_MAJOR are defined in HIP_VERSION.h.
// clang-format off
#define CK_HIP_PACKAGE_VERSION_MAJOR @CK_HIP_VERSION_MAJOR@
#define CK_HIP_PACKAGE_VERSION_MINOR @CK_HIP_VERSION_MINOR@
#define CK_HIP_PACKAGE_VERSION_PATCH @CK_HIP_VERSION_PATCH@
// clang-format on
// Default every version component that is not already defined to 0.
#ifndef CK_HIP_PACKAGE_VERSION_MAJOR
#define CK_HIP_PACKAGE_VERSION_MAJOR 0
#endif
#ifndef CK_HIP_PACKAGE_VERSION_MINOR
#define CK_HIP_PACKAGE_VERSION_MINOR 0
#endif
#ifndef CK_HIP_PACKAGE_VERSION_PATCH
#define CK_HIP_PACKAGE_VERSION_PATCH 0
#endif

// 3 decimal digits for major and minor, 6 digits for patch number.
// Max number is 999,999,999999 == 0xE8,D4A5,0FFF that fits into 64-bit math.
// Every component must be range-checked: an oversized minor version would otherwise spill into
// the major digits of CK_HIP_PACKAGE_VERSION_FLAT.
#if CK_HIP_PACKAGE_VERSION_MAJOR > 999 || CK_HIP_PACKAGE_VERSION_MINOR > 999 || \
    CK_HIP_PACKAGE_VERSION_PATCH > 999999
#error "Too big HIP version number(s)"
#endif

// Single comparable number laid out as MMMmmmpppppp (major, minor, patch); the 1000ULL factor
// forces the whole expression into 64-bit unsigned arithmetic.
#define CK_HIP_PACKAGE_VERSION_FLAT                                                       \
    ((CK_HIP_PACKAGE_VERSION_MAJOR * 1000ULL + CK_HIP_PACKAGE_VERSION_MINOR) * 1000000 + \
     CK_HIP_PACKAGE_VERSION_PATCH)
include/ck/host_utility/device_prop.hpp
0 → 100644
View file @
a3b4c5cb
#pragma once
#include <string>
#include <map>
namespace
ck
{
// Returns the architecture name of the current HIP device.
//
// The name is taken from hipDeviceProp_t::gcnArchName, cut at the first ':' (if any), and then
// translated through a table of known aliases. Names without an alias are returned as-is.
// Returns an empty string when the current device or its properties cannot be queried.
inline std::string get_device_name()
{
    hipDeviceProp_t props{};
    int device = 0; // overwritten by hipGetDevice on success

    auto status = hipGetDevice(&device);
    if(status != hipSuccess)
    {
        return std::string();
    }

    status = hipGetDeviceProperties(&props, device);
    if(status != hipSuccess)
    {
        return std::string();
    }

    const std::string raw_name(props.gcnArchName);

    // Alias table: reported name -> canonical gfx name. It is never modified, so it is const.
    // https://github.com/ROCmSoftwarePlatform/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
    static const std::map<std::string, std::string> device_name_map = {
        {"Ellesmere", "gfx803"},
        {"Baffin", "gfx803"},
        {"RacerX", "gfx803"},
        {"Polaris10", "gfx803"},
        {"Polaris11", "gfx803"},
        {"Tonga", "gfx803"},
        {"Fiji", "gfx803"},
        {"gfx800", "gfx803"},
        {"gfx802", "gfx803"},
        {"gfx804", "gfx803"},
        {"Vega10", "gfx900"},
        {"gfx901", "gfx900"},
        {"10.3.0 Sienna_Cichlid 18", "gfx1030"},
    };

    // Keep only the part before the first ':'. str.substr(0, npos) returns str.
    const auto name = raw_name.substr(0, raw_name.find(':'));

    const auto match = device_name_map.find(name);
    if(match != device_name_map.end())
        return match->second;

    return name;
}
}
// namespace ck
include/ck/options.hpp
0 → 100644
View file @
a3b4c5cb
#pragma once
#define CK_TIME_KERNEL 1
include/ck/stream_config.hpp
0 → 100644
View file @
a3b4c5cb
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
// Plain aggregate bundling a HIP stream handle with a timing flag.
struct StreamConfig
{
    // HIP stream handle; defaults to nullptr.
    hipStream_t stream_id_{nullptr};

    // Defaults to false. NOTE(review): presumably requests kernel timing — confirm with users.
    bool time_kernel_{false};
};
include/ck/tensor_description/tensor_descriptor_helper.hpp
View file @
a3b4c5cb
#ifndef CK_TENSOR_DESCRIPTOR_HELPER_HPP
#define CK_TENSOR_DESCRIPTOR_HELPER_HPP
#pragma once
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "multi_index_transform_helper.hpp"
...
...
@@ -35,6 +33,12 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt
}
#endif
// Lengths..., Strides... could be:
// 1) index_t, which is known at run-time, or
// 2) Number<>, which is known at compile-time
// element_space_size could be:
// 1) long_index_t, or
// 2) LongNumber<>
template
<
typename
...
Lengths
,
typename
...
Strides
,
typename
enable_if
<
sizeof
...(
Lengths
)
==
sizeof
...(
Strides
),
bool
>
::
type
=
false
>
...
...
@@ -68,10 +72,10 @@ __host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Leng
}
};
const
auto
element_space_size
=
f
(
f
,
Number
<
0
>
{},
Number
<
1
>
{});
const
auto
element_space_size
=
f
(
f
,
Number
<
0
>
{},
Long
Number
<
1
>
{});
#else
const
auto
element_space_size
=
calculate_element_space_size_impl
(
lengths
,
strides
,
Number
<
0
>
{},
Number
<
1
>
{});
calculate_element_space_size_impl
(
lengths
,
strides
,
Number
<
0
>
{},
Long
Number
<
1
>
{});
#endif
return
TensorDescriptor
<
remove_cv_t
<
decltype
(
transforms
)
>
,
...
...
@@ -82,9 +86,12 @@ __host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Leng
element_space_size
};
}
// Lengths... c
an
be:
// 1) index_t, which is known at run-time
// Lengths... c
ould
be:
// 1) index_t, which is known at run-time
, or
// 2) Number<>, which is known at compile-time
// element_space_size could be:
// 1) long_index_t, or
// 2) LongNumber<>
template
<
typename
...
Lengths
>
__host__
__device__
constexpr
auto
make_naive_tensor_descriptor_packed
(
const
Tuple
<
Lengths
...
>&
lengths
)
...
...
@@ -100,7 +107,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
constexpr
auto
visible_dim_hidden_ids
=
typename
arithmetic_sequence_gen
<
1
,
N
+
1
,
1
>::
type
{};
const
auto
element_space_size
=
container_reduce
(
lengths
,
math
::
multiplies
{},
Number
<
1
>
{});
const
auto
element_space_size
=
container_reduce
(
lengths
,
math
::
multiplies
{},
Long
Number
<
1
>
{});
return
TensorDescriptor
<
remove_cv_t
<
decltype
(
transforms
)
>
,
remove_cv_t
<
decltype
(
low_dim_hidden_idss
)
>
,
...
...
@@ -110,6 +117,12 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
element_space_size
};
}
// Lengths... could be:
// 1) index_t, which is known at run-time, or
// 2) Number<>, which is known at compile-time
// align could be:
// 1) index_t, or
// 2) Number<>
template
<
typename
...
Lengths
,
typename
Align
>
__host__
__device__
constexpr
auto
make_naive_tensor_descriptor_aligned
(
const
Tuple
<
Lengths
...
>&
lengths
,
Align
align
)
...
...
@@ -146,4 +159,3 @@ make_naive_tensor_descriptor_aligned(const Tuple<Lengths...>& lengths, Align ali
}
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl
ops
_v2r3.hpp
→
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
View file @
a3b4c5cb
#ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP
#define CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP
#pragma once
#include "common_header.hpp"
#include "tensor_adaptor.hpp"
#include "threadwise_tensor_slice_transfer_v
2
.hpp"
#include "threadwise_contraction_dl
ops
.hpp"
#include "threadwise_tensor_slice_transfer_v
4r1
.hpp"
#include "threadwise_contraction_dl.hpp"
namespace
ck
{
...
...
@@ -41,7 +39,7 @@ template <index_t BlockSize,
typename
enable_if
<
ABlockDesc_BK0_BM_BK1
::
IsKnownAtCompileTime
()
&&
BBlockDesc_BK0_BN_BK1
::
IsKnownAtCompileTime
(),
bool
>
::
type
=
false
>
struct
BlockwiseGemmDl
ops
_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
struct
BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
{
using
AIndex
=
MultiIndex
<
3
>
;
using
BIndex
=
MultiIndex
<
3
>
;
...
...
@@ -148,7 +146,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
MakeBBlockDescriptor_BK0_BN0_BN1_BK1
(
BBlockDesc_BK0_BN_BK1
{});
public:
__device__
BlockwiseGemmDl
ops
_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
()
__device__
BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
()
:
c_thread_origin_data_idx_
{
CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1
(
get_thread_local_1d_id
())},
a_thread_copy_
{
...
...
@@ -175,6 +173,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
"wrong!"
);
// TODO: remove this restriction
static_assert
(
BM0
==
2
,
"wrong"
);
static_assert
(
BM0
==
2
&&
BN0
==
2
,
"wrong"
);
}
...
...
@@ -226,7 +225,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
b_thread_desc_bk0_bn0_bn1_bk1_
.
GetElementSpaceSize
());
constexpr
auto
threadwise_contraction
=
ThreadwiseContractionDl
ops
_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
<
ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
<
FloatA
,
FloatB
,
FloatC
,
...
...
@@ -407,4 +406,3 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
};
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
View file @
a3b4c5cb
#ifndef CK_BLOCKWISE_GEMM_XDLOPS_HPP
#define CK_BLOCKWISE_GEMM_XDLOPS_HPP
#pragma once
#include "common_header.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "xdlops_gemm.hpp"
#include "tensor_adaptor.hpp"
#include "thread_group.hpp"
namespace
ck
{
// Loop scheduling policy used to pick a blockwise GEMM implementation.
enum class LoopScheduler
{
    Default,   // value 0
    Interwave, // value 1
};
// Returns the loop scheduler selected at build time: Interwave when
// CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING is non-zero, Default otherwise.
constexpr LoopScheduler make_default_loop_scheduler()
{
#if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
    constexpr LoopScheduler selected = LoopScheduler::Interwave;
#else
    constexpr LoopScheduler selected = LoopScheduler::Default;
#endif // if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING

    return selected;
}
template
<
index_t
BlockSize
,
typename
FloatAB
,
typename
FloatAcc
,
...
...
@@ -17,7 +31,8 @@ template <index_t BlockSize,
index_t
NPerXDL
,
index_t
MRepeat
,
index_t
NRepeat
,
index_t
KPack
>
index_t
KPack
,
index_t
K1Factor
=
1
>
struct
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
...
...
@@ -25,7 +40,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
index_t
WaveSize
=
64
;
using
ThisThreadBlock
=
ThisThreadBlock
<
BlockSize
>
;
static
constexpr
index_t
WaveSize
=
get_warp_size
();
static
constexpr
index_t
MPerBlock
=
AK0MK1BlockDesc
{}.
GetLength
(
I1
);
static
constexpr
index_t
NPerBlock
=
BK0NK1BlockDesc
{}.
GetLength
(
I1
);
...
...
@@ -55,7 +72,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
__device__
static
auto
GetWaveIdx
()
{
const
index_t
thread_id
=
get_t
hread
_
loc
al_1d_i
d
();
const
index_t
thread_id
=
ThisT
hread
B
loc
k
::
GetThreadI
d
();
constexpr
auto
threadid_to_wave_idx_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
MWaves
,
NWaves
,
WaveSize
))),
...
...
@@ -122,8 +139,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
BK0NK1BlockDesc
::
IsKnownAtCompileTime
(),
"wrong! Desc should be known at compile-time"
);
static_assert
(
BlockSize
==
MWaves
*
NWaves
*
WaveSize
,
"
BlockSize
!= MWaves * NWaves * WaveSize
\n
"
);
static_assert
(
ThisThreadBlock
::
GetNumOfThread
()
==
MWaves
*
NWaves
*
WaveSize
,
"
ThisThreadBlock::GetNumOfThread()
!= MWaves * NWaves * WaveSize
\n
"
);
static_assert
(
MPerBlock
%
(
MPerXDL
*
MRepeat
)
==
0
&&
NPerBlock
%
(
NPerXDL
*
NRepeat
)
==
0
,
"wrong!"
);
...
...
@@ -257,7 +274,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
auto
b_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
FloatAB
>
(
b_thread_desc_
.
GetElementSpaceSize
());
#if 1
if
constexpr
(
K1Factor
!=
1
)
{
//static_for<0, KPerThread, KPack>{}([&](auto k) {
// static_for<0, MRepeat, 1>{}([&](auto m0) {
// // read A
...
...
@@ -347,8 +365,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
});
});
#else
}
else
{
static_for
<
0
,
MRepeat
,
1
>
{}([
&
](
auto
m0
)
{
// read A
a_thread_copy_
.
Run
(
a_block_desc_m0_m1_m2_k
,
...
...
@@ -391,10 +410,10 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
});
});
});
#endif
}
}
pr
ivate
:
pr
otected
:
// A[M0, M1, M2, KPerThread]
static
constexpr
auto
a_thread_desc_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MRepeat
>
{},
I1
,
I1
,
Number
<
KPerBlock
>
{}));
...
...
@@ -407,8 +426,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
static
constexpr
auto
c_thread_desc_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MRepeat
>
{},
Number
<
NRepeat
>
{},
xdlops_gemm
.
GetRegSizePerXdlops
()));
static
constexpr
index_t
A_K1_vec
=
A_K1
/
2
;
static
constexpr
index_t
B_K1_vec
=
B_K1
/
2
;
static
constexpr
index_t
A_K1_vec
=
A_K1
/
K1Factor
;
static
constexpr
index_t
B_K1_vec
=
B_K1
/
K1Factor
;
using
AThreadCopy
=
ThreadwiseTensorSliceTransfer_v4
<
FloatAB
,
FloatAB
,
...
...
@@ -434,5 +453,232 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
BThreadCopy
b_thread_copy_
{
CalculateBThreadOriginDataIndex
()};
};
// Note: To facilitate the inter-wave loop scheduler, we need to explicitly set the macro
// CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=1 as a few intrinsics are not yet available in
// the latest ROCm release. For unsupported compilers, inter-wave loop scheduler falls back to the
// default loop scheduler which is given by the macro CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=0
// Variant of BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 whose Run() splits the K loop
// into chunks of KPerInnerLoop and brackets each chunk with scheduling barriers, a workgroup
// barrier and wave-priority changes. The specialised members only exist when
// CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING is non-zero; otherwise everything is inherited unchanged
// from the base class.
template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename AK0MK1BlockDesc,
          typename BK0NK1BlockDesc,
          index_t MPerXDL,
          index_t NPerXDL,
          index_t MRepeat,
          index_t NRepeat,
          index_t KPack,
          index_t NumMacClusters = CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS>
struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    : public BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
                                                                 FloatAB,
                                                                 FloatAcc,
                                                                 AK0MK1BlockDesc,
                                                                 BK0NK1BlockDesc,
                                                                 MPerXDL,
                                                                 NPerXDL,
                                                                 MRepeat,
                                                                 NRepeat,
                                                                 KPack>
{
    using Base = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
                                                                     FloatAB,
                                                                     FloatAcc,
                                                                     AK0MK1BlockDesc,
                                                                     BK0NK1BlockDesc,
                                                                     MPerXDL,
                                                                     NPerXDL,
                                                                     MRepeat,
                                                                     NRepeat,
                                                                     KPack>;

#if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
    // Names reused from the base class.
    using Base::a_block_desc_m0_m1_m2_k;
    using Base::A_K1;
    using Base::b_block_desc_n0_n1_n2_k;
    using Base::B_K1;
    using Base::c_thread_buf_;
    using Base::c_thread_desc_;
    using Base::CalculateAThreadOriginDataIndex;
    using Base::CalculateBThreadOriginDataIndex;
    using Base::I0;
    using Base::I1;
    using Base::KPerThread;
    using Base::xdlops_gemm;

    // K extent handled per outer iteration: KPerThread split into NumMacClusters parts, but
    // never smaller than KPack.
    static constexpr index_t KPerInnerLoop = math::max(KPerThread / NumMacClusters, KPack);

    // 2-wave optimized blockwise gemm.
    // For each K chunk: copy the A and B slices into VGPR static buffers, then run
    // xdlops_gemm over every (k, m, n) tile of the chunk, accumulating into c_thread_buf.
    template <typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
    __device__ void Run(const ABlockBuffer& a_block_buf,
                        const BBlockBuffer& b_block_buf,
                        CThreadBuffer& c_thread_buf) const
    {
        auto a_reg_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
            a_thread_desc_.GetElementSpaceSize());
        auto b_reg_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
            b_thread_desc_.GetElementSpaceSize());

        static_for<0, KPerThread, KPerInnerLoop>{}([&](auto k_cluster) {
            // read A
            static_for<0, MRepeat, 1>{}([&](auto m_rep) {
                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
                                   make_tuple(m_rep, I0, I0, k_cluster),
                                   a_block_buf,
                                   a_thread_desc_,
                                   make_tuple(m_rep, I0, I0, I0),
                                   a_reg_buf);
            });

            // read B
            static_for<0, NRepeat, 1>{}([&](auto n_rep) {
                b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
                                   make_tuple(n_rep, I0, I0, k_cluster),
                                   b_block_buf,
                                   b_thread_desc_,
                                   make_tuple(n_rep, I0, I0, I0),
                                   b_reg_buf);
            });

            __builtin_amdgcn_sched_barrier();

            // NOTE: Synchronize threads in a workgroup at the start of each MAC cluster, except
            // the first one, as we can shorten the non-MAC cluster a bit and there's no
            // observable negative impact. The desired effect is waves in a workgroup executing
            // MAC in sync. This avoids some out-of-sync waves hijacking MAC resource from other
            // workgroups and reducing the chance of latency hiding by waiting for the rest of
            // the workgroup at the eventual sync point.
            if constexpr(k_cluster.value != 0 || KPerInnerLoop == KPerThread)
            {
                asm volatile("s_barrier" ::);
                __builtin_amdgcn_sched_barrier();
            }

            static_for<0, KPerInnerLoop, KPack>{}([&](auto k_pack) {
                static_for<0, MRepeat, 1>{}([&](auto m_rep) {
                    static_for<0, NRepeat, 1>{}([&](auto n_rep) {
                        vector_type<FloatAB, KPack> a_vec;
                        vector_type<FloatAB, KPack> b_vec;

                        // Gather KPack consecutive K elements of A and B for this tile.
                        static_for<0, KPack, 1>{}([&](auto ik) {
                            a_vec.template AsType<FloatAB>()(ik) =
                                a_reg_buf[Number<a_thread_desc_.CalculateOffset(
                                    make_tuple(m_rep, 0, 0, k_pack + ik))>{}];
                            b_vec.template AsType<FloatAB>()(ik) =
                                b_reg_buf[Number<b_thread_desc_.CalculateOffset(
                                    make_tuple(n_rep, 0, 0, k_pack + ik))>{}];
                        });

                        using MfmaOperand =
                            typename vector_type<FloatAB, xdlops_gemm.K1PerXdlops>::type;

                        constexpr index_t c_dst_offset =
                            c_thread_desc_.CalculateOffset(make_tuple(m_rep, n_rep, 0));

                        // The block_sync_lds() here performs double duty:
                        // A) safeguard against data hazard because barrier from blockwise_gemm
                        //    is moved here
                        // B) reduce VMEM FIFO congestion by applying small delays to different
                        //    wavefronts
                        // It is performed near the end of the MAC cluster (last k chunk, last
                        // k_pack, last m and n repeat) to minimize lgkmcnt penalty.
                        if constexpr(k_cluster.value == KPerThread - KPerInnerLoop &&
                                     k_pack.value == KPerInnerLoop - KPack &&
                                     m_rep.value == MRepeat - 1 && n_rep.value == NRepeat - 1)
                        {
                            __builtin_amdgcn_sched_barrier();
                            block_sync_lds();
                            __builtin_amdgcn_sched_barrier();
                        }

                        // TODO: insert setprio in a more precise manner since a single call
                        // could issue more than one MFMA instruction
                        xdlops_gemm.template Run(
                            a_vec.template AsType<MfmaOperand>(),
                            b_vec.template AsType<MfmaOperand>(),
                            c_thread_buf.GetVectorTypeReference(Number<c_dst_offset>{}));

                        // Raise the wave priority right after the first xdlops_gemm call of
                        // the chunk.
                        if constexpr(k_pack.value == 0 && m_rep.value == 0 && n_rep.value == 0)
                        {
                            __builtin_amdgcn_sched_barrier();
                            __builtin_amdgcn_s_setprio(1);
                            __builtin_amdgcn_sched_barrier();
                        }
                    });
                });
            });

            // Drop the wave priority back to 0 once the chunk is done.
            __builtin_amdgcn_sched_barrier();
            __builtin_amdgcn_s_setprio(0);
            __builtin_amdgcn_sched_barrier();
        });
    }

    protected:
    // A[M0, M1, M2, KPerInnerLoop]
    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
        make_tuple(Number<MRepeat>{}, I1, I1, Number<KPerInnerLoop>{}));

    // B[N0, N1, N2, KPerInnerLoop]
    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
        make_tuple(Number<NRepeat>{}, I1, I1, Number<KPerInnerLoop>{}));

    // Per-thread copy of one [1, 1, 1, KPerInnerLoop] slice of A into a_thread_desc_ layout.
    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
                                                         FloatAB,
                                                         decltype(a_block_desc_m0_m1_m2_k),
                                                         decltype(a_thread_desc_),
                                                         Sequence<1, 1, 1, KPerInnerLoop>,
                                                         Sequence<0, 1, 2, 3>,
                                                         3,
                                                         A_K1,
                                                         A_K1>;

    // Per-thread copy of one [1, 1, 1, KPerInnerLoop] slice of B into b_thread_desc_ layout.
    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatAB,
                                                         FloatAB,
                                                         decltype(b_block_desc_n0_n1_n2_k),
                                                         decltype(b_thread_desc_),
                                                         Sequence<1, 1, 1, KPerInnerLoop>,
                                                         Sequence<0, 1, 2, 3>,
                                                         3,
                                                         B_K1,
                                                         B_K1>;

    AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()};
    BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()};

#endif // #if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING
};
// Returns a default-constructed blockwise GEMM object matching LoopSched:
//   LoopScheduler::Default   -> BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
//   LoopScheduler::Interwave -> BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
// Callers typically use decltype() on the result to obtain the selected type.
template <index_t BlockSize,
          typename FloatAB,
          typename FloatAcc,
          typename AK0MK1BlockDesc,
          typename BK0NK1BlockDesc,
          index_t MPerXDL,
          index_t NPerXDL,
          index_t MRepeat,
          index_t NRepeat,
          index_t KPack,
          LoopScheduler LoopSched>
constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()
{
    // The two conditions are mutually exclusive, so the order of the checks does not matter.
    if constexpr(LoopSched == LoopScheduler::Interwave)
    {
        return BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
                                                                            FloatAB,
                                                                            FloatAcc,
                                                                            AK0MK1BlockDesc,
                                                                            BK0NK1BlockDesc,
                                                                            MPerXDL,
                                                                            NPerXDL,
                                                                            MRepeat,
                                                                            NRepeat,
                                                                            KPack>{};
    }
    else if constexpr(LoopSched == LoopScheduler::Default)
    {
        return BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1<BlockSize,
                                                                   FloatAB,
                                                                   FloatAcc,
                                                                   AK0MK1BlockDesc,
                                                                   BK0NK1BlockDesc,
                                                                   MPerXDL,
                                                                   NPerXDL,
                                                                   MRepeat,
                                                                   NRepeat,
                                                                   KPack>{};
    }
}
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
View file @
a3b4c5cb
...
...
@@ -45,8 +45,8 @@ struct BlockwiseTensorSliceTransfer_v5r1
src_desc
,
make_zero_multi_index
<
nDim
>
(),
dst_desc
,
make_zero_multi_index
<
nDim
>
())
{
static_assert
(
nDim
==
remove_ref
erence_t
<
remove_cv
_t
<
SrcDesc
>
>
::
GetNumOfDimension
()
&&
nDim
==
remove_ref
erence_t
<
remove_cv
_t
<
DstDesc
>
>
::
GetNumOfDimension
()
&&
static_assert
(
nDim
==
remove_
cv
ref_t
<
SrcDesc
>::
GetNumOfDimension
()
&&
nDim
==
remove_
cv
ref_t
<
DstDesc
>::
GetNumOfDimension
()
&&
nDim
==
BlockSliceLengths
::
Size
()
&&
nDim
==
ThreadSliceLengths
::
Size
()
&&
nDim
==
ThreadClusterLengths
::
Size
()
&&
nDim
==
ThreadClusterArrangeOrder
::
Size
()
&&
...
...
@@ -75,14 +75,13 @@ struct BlockwiseTensorSliceTransfer_v5r1
}
}
template
<
typename
SrcBuffer
,
typename
SrcStepHacks
>
__device__
void
RunRead
(
const
SrcDesc
&
src_desc
,
const
SrcBuffer
&
src_buf
,
const
SrcStepHacks
&
src_step_hacks
)
template
<
typename
SrcBuffer
>
__device__
void
RunRead
(
const
SrcDesc
&
src_desc
,
const
SrcBuffer
&
src_buf
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_id
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
RunRead
(
src_desc
,
src_buf
,
src_step_hacks
);
threadwise_transfer_
.
RunRead
(
src_desc
,
src_buf
);
}
}
...
...
include/ck/tensor_operation/gpu/block/
blockwise
_tensor_slice_transfer_v4r1.hpp
→
include/ck/tensor_operation/gpu/block/
thread_group
_tensor_slice_transfer_v4r1.hpp
View file @
a3b4c5cb
#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V4R1_HPP
#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V4R1_HPP
#pragma once
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
...
...
@@ -13,7 +11,7 @@ namespace ck {
// 1. Use StaticallyIndexedArray instead of C array for thread buffer
// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
template
<
index_t
BlockSize
,
template
<
typename
ThreadGroup
,
typename
SrcElementwiseOperation
,
typename
DstElementwiseOperation
,
InMemoryDataOperationEnum
DstInMemOp
,
...
...
@@ -35,7 +33,7 @@ template <index_t BlockSize,
bool
ThreadTransferSrcResetCoordinateAfterRun
,
bool
ThreadTransferDstResetCoordinateAfterRun
,
index_t
NumThreadScratch
=
1
>
struct
Blockwise
TensorSliceTransfer_v4r1
struct
ThreadGroup
TensorSliceTransfer_v4r1
{
static
constexpr
index_t
nDim
=
remove_reference_t
<
SrcDesc
>::
GetNumOfDimension
();
...
...
@@ -43,7 +41,7 @@ struct BlockwiseTensorSliceTransfer_v4r1
using
Index
=
MultiIndex
<
nDim
>
;
__device__
constexpr
Blockwise
TensorSliceTransfer_v4r1
(
__device__
constexpr
ThreadGroup
TensorSliceTransfer_v4r1
(
const
SrcDesc
&
src_desc
,
const
Index
&
src_block_slice_origin
,
const
SrcElementwiseOperation
&
src_element_op
,
...
...
@@ -58,8 +56,8 @@ struct BlockwiseTensorSliceTransfer_v4r1
dst_element_op
)
{
static_assert
(
nDim
==
remove_ref
erence_t
<
remove_cv
_t
<
SrcDesc
>
>
::
GetNumOfDimension
()
&&
nDim
==
remove_ref
erence_t
<
remove_cv
_t
<
DstDesc
>
>
::
GetNumOfDimension
()
&&
static_assert
(
nDim
==
remove_
cv
ref_t
<
SrcDesc
>::
GetNumOfDimension
()
&&
nDim
==
remove_
cv
ref_t
<
DstDesc
>::
GetNumOfDimension
()
&&
nDim
==
ThreadClusterLengths
::
Size
()
&&
nDim
==
ThreadClusterArrangeOrder
::
Size
()
&&
nDim
==
SrcDimAccessOrder
::
Size
()
&&
nDim
==
DstDimAccessOrder
::
Size
(),
...
...
@@ -69,14 +67,14 @@ struct BlockwiseTensorSliceTransfer_v4r1
is_same
<
BlockSliceLengths
,
decltype
(
thread_slice_lengths
*
ThreadClusterLengths
{})
>
{},
"wrong! threads should be mapped to cover entire slicing window"
);
static_assert
(
BlockSize
>=
thread_cluster_desc_
.
GetElementSize
(),
"wrong!
BlockSize
too small"
);
static_assert
(
ThreadGroup
::
GetNumOfThread
()
>=
thread_cluster_desc_
.
GetElementSize
(),
"wrong!
ThreadGroup::GetNumOfThread()
too small"
);
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
const
auto
thread_cluster_idx
=
thread_cluster_desc_
.
CalculateBottomIndex
(
make_multi_index
(
get_thread_local_1d_i
d
()));
make_multi_index
(
ThreadGroup
::
GetThreadI
d
()));
const
auto
thread_data_idx_begin
=
thread_cluster_idx
*
thread_slice_lengths
;
...
...
@@ -92,8 +90,8 @@ struct BlockwiseTensorSliceTransfer_v4r1
const
SrcBuffer
&
src_buf
,
Number
<
ThreadScratchId
>
thread_scratch_id
=
Number
<
ThreadScratchId
>
{})
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
RunRead
(
src_desc
,
src_buf
,
thread_scratch_id
);
}
...
...
@@ -104,8 +102,8 @@ struct BlockwiseTensorSliceTransfer_v4r1
DstBuffer
&
dst_buf
,
Number
<
ThreadScratchId
>
thread_scratch_id
=
Number
<
ThreadScratchId
>
{})
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
RunWrite
(
dst_desc
,
dst_buf
,
thread_scratch_id
);
}
...
...
@@ -124,8 +122,8 @@ struct BlockwiseTensorSliceTransfer_v4r1
__device__
void
MoveSrcSliceWindow
(
const
SrcDesc
&
src_desc
,
const
Index
&
step
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
MoveSrcSliceWindow
(
src_desc
,
step
);
}
...
...
@@ -133,8 +131,8 @@ struct BlockwiseTensorSliceTransfer_v4r1
__device__
void
MoveDstSliceWindow
(
const
DstDesc
&
dst_desc
,
const
Index
&
step
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
MoveDstSliceWindow
(
dst_desc
,
step
);
}
...
...
@@ -169,4 +167,3 @@ struct BlockwiseTensorSliceTransfer_v4r1
};
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/block/
blockwise
_tensor_slice_transfer_v6r1.hpp
→
include/ck/tensor_operation/gpu/block/
thread_group
_tensor_slice_transfer_v6r1.hpp
View file @
a3b4c5cb
#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V6R1_HPP
#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V6R1_HPP
#pragma once
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
...
...
@@ -13,10 +11,10 @@ namespace ck {
// 1. Use StaticallyIndexedArray instead of C array for thread buffer
// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
template
<
index_t
BlockSize
,
template
<
typename
ThreadGroup
,
typename
ElementwiseOperation
,
InMemoryDataOperationEnum
DstInMemOp
,
typename
Block
SliceLengths
,
typename
SliceLengths
,
typename
ThreadClusterLengths
,
typename
ThreadClusterArrangeOrder
,
typename
SrcData
,
...
...
@@ -28,19 +26,19 @@ template <index_t BlockSize,
index_t
ScalarPerVector
,
bool
ThreadTransferSrcResetCoordinateAfterRun
,
bool
ThreadTransferDstResetCoordinateAfterRun
>
struct
Blockwise
TensorSliceTransfer_v6r1
struct
ThreadGroup
TensorSliceTransfer_v6r1
{
static
constexpr
index_t
nDim
=
remove_reference_t
<
SrcDesc
>::
GetNumOfDimension
();
static
constexpr
auto
thread_slice_lengths
=
Block
SliceLengths
{}
/
ThreadClusterLengths
{};
static
constexpr
auto
thread_slice_lengths
=
SliceLengths
{}
/
ThreadClusterLengths
{};
using
Index
=
MultiIndex
<
nDim
>
;
__device__
constexpr
Blockwise
TensorSliceTransfer_v6r1
(
const
SrcDesc
&
src_desc
,
const
Index
&
src_block_slice_origin
,
const
DstDesc
&
dst_desc
,
const
Index
&
dst_block_slice_origin
,
const
ElementwiseOperation
&
element_op
)
__device__
constexpr
ThreadGroup
TensorSliceTransfer_v6r1
(
const
SrcDesc
&
src_desc
,
const
Index
&
src_block_slice_origin
,
const
DstDesc
&
dst_desc
,
const
Index
&
dst_block_slice_origin
,
const
ElementwiseOperation
&
element_op
)
:
threadwise_transfer_
(
src_desc
,
make_zero_multi_index
<
nDim
>
(),
dst_desc
,
...
...
@@ -48,25 +46,25 @@ struct BlockwiseTensorSliceTransfer_v6r1
element_op
)
{
static_assert
(
nDim
==
remove_ref
erence_t
<
remove_cv
_t
<
SrcDesc
>
>
::
GetNumOfDimension
()
&&
nDim
==
remove_ref
erence_t
<
remove_cv
_t
<
DstDesc
>
>
::
GetNumOfDimension
()
&&
static_assert
(
nDim
==
remove_
cv
ref_t
<
SrcDesc
>::
GetNumOfDimension
()
&&
nDim
==
remove_
cv
ref_t
<
DstDesc
>::
GetNumOfDimension
()
&&
nDim
==
ThreadClusterLengths
::
Size
()
&&
nDim
==
ThreadClusterArrangeOrder
::
Size
()
&&
nDim
==
DimAccessOrder
::
Size
(),
"wrong! nDim not consistent"
);
static_assert
(
is_same
<
Block
SliceLengths
,
decltype
(
thread_slice_lengths
*
ThreadClusterLengths
{})
>
{},
is_same
<
SliceLengths
,
decltype
(
thread_slice_lengths
*
ThreadClusterLengths
{})
>
{},
"wrong! threads should be mapped to cover entire slicing window"
);
static_assert
(
BlockSize
>=
thread_cluster_desc_
.
GetElementSize
(),
"wrong!
BlockSize
too small"
);
static_assert
(
ThreadGroup
::
GetNumOfThread
()
>=
thread_cluster_desc_
.
GetElementSize
(),
"wrong!
ThreadGroup::GetNumOfThread()
too small"
);
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
const
auto
thread_cluster_idx
=
thread_cluster_desc_
.
CalculateBottomIndex
(
make_multi_index
(
get_thread_local_1d_i
d
()));
make_multi_index
(
ThreadGroup
::
GetThreadI
d
()));
const
auto
thread_data_idx_begin
=
thread_cluster_idx
*
thread_slice_lengths
;
...
...
@@ -83,8 +81,8 @@ struct BlockwiseTensorSliceTransfer_v6r1
const
DstDesc
&
dst_desc
,
DstBuffer
&
dst_buf
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
Run
(
src_desc
,
src_buf
,
dst_desc
,
dst_buf
);
}
...
...
@@ -92,8 +90,8 @@ struct BlockwiseTensorSliceTransfer_v6r1
__device__
void
MoveSrcSliceWindow
(
const
SrcDesc
&
src_desc
,
const
Index
&
step
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
MoveSrcSliceWindow
(
src_desc
,
step
);
}
...
...
@@ -101,8 +99,8 @@ struct BlockwiseTensorSliceTransfer_v6r1
__device__
void
MoveDstSliceWindow
(
const
DstDesc
&
dst_desc
,
const
Index
&
step
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
MoveDstSliceWindow
(
dst_desc
,
step
);
}
...
...
@@ -130,4 +128,3 @@ struct BlockwiseTensorSliceTransfer_v6r1
};
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/block/
blockwise
_tensor_slice_transfer_v6r2.hpp
→
include/ck/tensor_operation/gpu/block/
thread_group
_tensor_slice_transfer_v6r2.hpp
View file @
a3b4c5cb
#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V6R2_HPP
#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V6R2_HPP
#pragma once
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
...
...
@@ -13,10 +11,10 @@ namespace ck {
// 1. Use StaticallyIndexedArray instead of C array for thread buffer
// 2. It does not keep reference to tensor descriptor
// 3. Run() does not construct new tensor coordinate
template
<
index_t
BlockSize
,
template
<
typename
ThreadGroup
,
typename
ElementwiseOperation
,
InMemoryDataOperationEnum
DstInMemOp
,
typename
Block
SliceLengths
,
typename
SliceLengths
,
typename
ThreadClusterLengths
,
typename
ThreadClusterArrangeOrder
,
typename
Src0Data
,
...
...
@@ -31,21 +29,21 @@ template <index_t BlockSize,
bool
ThreadTransferSrc0ResetCoordinateAfterRun
,
bool
ThreadTransferSrc1ResetCoordinateAfterRun
,
bool
ThreadTransferDstResetCoordinateAfterRun
>
struct
Blockwise
TensorSliceTransfer_v6r2
struct
ThreadGroup
TensorSliceTransfer_v6r2
{
static
constexpr
index_t
nDim
=
remove_reference_t
<
Src0Desc
>::
GetNumOfDimension
();
static
constexpr
auto
thread_slice_lengths
=
Block
SliceLengths
{}
/
ThreadClusterLengths
{};
static
constexpr
auto
thread_slice_lengths
=
SliceLengths
{}
/
ThreadClusterLengths
{};
using
Index
=
MultiIndex
<
nDim
>
;
__device__
constexpr
Blockwise
TensorSliceTransfer_v6r2
(
const
Src0Desc
&
src0_desc
,
const
Index
&
src0_block_slice_origin
,
const
Src1Desc
&
src1_desc
,
const
Index
&
src1_block_slice_origin
,
const
DstDesc
&
dst_desc
,
const
Index
&
dst_block_slice_origin
,
const
ElementwiseOperation
&
element_op
)
__device__
constexpr
ThreadGroup
TensorSliceTransfer_v6r2
(
const
Src0Desc
&
src0_desc
,
const
Index
&
src0_block_slice_origin
,
const
Src1Desc
&
src1_desc
,
const
Index
&
src1_block_slice_origin
,
const
DstDesc
&
dst_desc
,
const
Index
&
dst_block_slice_origin
,
const
ElementwiseOperation
&
element_op
)
:
threadwise_transfer_
(
src0_desc
,
make_zero_multi_index
<
nDim
>
(),
src1_desc
,
...
...
@@ -55,26 +53,26 @@ struct BlockwiseTensorSliceTransfer_v6r2
element_op
)
{
static_assert
(
nDim
==
remove_ref
erence_t
<
remove_cv
_t
<
Src0Desc
>
>
::
GetNumOfDimension
()
&&
nDim
==
remove_ref
erence_t
<
remove_cv
_t
<
Src1Desc
>
>
::
GetNumOfDimension
()
&&
nDim
==
remove_ref
erence_t
<
remove_cv
_t
<
DstDesc
>
>
::
GetNumOfDimension
()
&&
static_assert
(
nDim
==
remove_
cv
ref_t
<
Src0Desc
>::
GetNumOfDimension
()
&&
nDim
==
remove_
cv
ref_t
<
Src1Desc
>::
GetNumOfDimension
()
&&
nDim
==
remove_
cv
ref_t
<
DstDesc
>::
GetNumOfDimension
()
&&
nDim
==
ThreadClusterLengths
::
Size
()
&&
nDim
==
ThreadClusterArrangeOrder
::
Size
()
&&
nDim
==
DimAccessOrder
::
Size
(),
"wrong! nDim not consistent"
);
static_assert
(
is_same
<
Block
SliceLengths
,
decltype
(
thread_slice_lengths
*
ThreadClusterLengths
{})
>
{},
is_same
<
SliceLengths
,
decltype
(
thread_slice_lengths
*
ThreadClusterLengths
{})
>
{},
"wrong! threads should be mapped to cover entire slicing window"
);
static_assert
(
BlockSize
>=
thread_cluster_desc_
.
GetElementSize
(),
"wrong!
BlockSize
too small"
);
static_assert
(
ThreadGroup
::
GetNumOfThread
()
>=
thread_cluster_desc_
.
GetElementSize
(),
"wrong!
ThreadGroup::GetNumOfThread()
too small"
);
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
const
auto
thread_cluster_idx
=
thread_cluster_desc_
.
CalculateBottomIndex
(
make_multi_index
(
get_thread_local_1d_i
d
()));
make_multi_index
(
ThreadGroup
::
GetThreadI
d
()));
const
auto
thread_data_idx_begin
=
thread_cluster_idx
*
thread_slice_lengths
;
...
...
@@ -95,8 +93,8 @@ struct BlockwiseTensorSliceTransfer_v6r2
const
DstDesc
&
dst_desc
,
DstBuffer
&
dst_buf
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
Run
(
src0_desc
,
src0_buf
,
src1_desc
,
src1_buf
,
dst_desc
,
dst_buf
);
}
...
...
@@ -104,8 +102,8 @@ struct BlockwiseTensorSliceTransfer_v6r2
__device__
void
MoveSrc0SliceWindow
(
const
Src0Desc
&
src0_desc
,
const
Index
&
step
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
MoveSrc0SliceWindow
(
src0_desc
,
step
);
}
...
...
@@ -113,8 +111,8 @@ struct BlockwiseTensorSliceTransfer_v6r2
__device__
void
MoveSrc1SliceWindow
(
const
Src1Desc
&
src1_desc
,
const
Index
&
step
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
MoveSrc1SliceWindow
(
src1_desc
,
step
);
}
...
...
@@ -122,8 +120,8 @@ struct BlockwiseTensorSliceTransfer_v6r2
__device__
void
MoveDstSliceWindow
(
const
DstDesc
&
dst_desc
,
const
Index
&
step
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
MoveDstSliceWindow
(
dst_desc
,
step
);
}
...
...
@@ -154,4 +152,3 @@ struct BlockwiseTensorSliceTransfer_v6r2
};
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/block/
blockwise
_tensor_slice_transfer_v6r3.hpp
→
include/ck/tensor_operation/gpu/block/
thread_group
_tensor_slice_transfer_v6r3.hpp
View file @
a3b4c5cb
#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V6R3_HPP
#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V6R3_HPP
#pragma once
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
...
...
@@ -13,10 +11,10 @@ namespace ck {
// 1. Use StaticallyIndexedArray instead of C array for thread buffer
// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
template
<
index_t
BlockSize
,
template
<
typename
ThreadGroup
,
typename
ElementwiseOperation
,
InMemoryDataOperationEnum
DstInMemOp
,
typename
Block
SliceLengths
,
typename
SliceLengths
,
typename
ThreadClusterLengths
,
typename
ThreadClusterArrangeOrder
,
typename
Src0Data
,
...
...
@@ -34,23 +32,23 @@ template <index_t BlockSize,
bool
ThreadTransferSrc1ResetCoordinateAfterRun
,
bool
ThreadTransferSrc2ResetCoordinateAfterRun
,
bool
ThreadTransferDstResetCoordinateAfterRun
>
struct
Blockwise
TensorSliceTransfer_v6r3
struct
ThreadGroup
TensorSliceTransfer_v6r3
{
static
constexpr
index_t
nDim
=
remove_reference_t
<
Src0Desc
>::
GetNumOfDimension
();
static
constexpr
auto
thread_slice_lengths
=
Block
SliceLengths
{}
/
ThreadClusterLengths
{};
static
constexpr
auto
thread_slice_lengths
=
SliceLengths
{}
/
ThreadClusterLengths
{};
using
Index
=
MultiIndex
<
nDim
>
;
__device__
constexpr
Blockwise
TensorSliceTransfer_v6r3
(
const
Src0Desc
&
src0_desc
,
const
Index
&
src0_block_slice_origin
,
const
Src1Desc
&
src1_desc
,
const
Index
&
src1_block_slice_origin
,
const
Src2Desc
&
src2_desc
,
const
Index
&
src2_block_slice_origin
,
const
DstDesc
&
dst_desc
,
const
Index
&
dst_block_slice_origin
,
const
ElementwiseOperation
&
element_op
)
__device__
constexpr
ThreadGroup
TensorSliceTransfer_v6r3
(
const
Src0Desc
&
src0_desc
,
const
Index
&
src0_block_slice_origin
,
const
Src1Desc
&
src1_desc
,
const
Index
&
src1_block_slice_origin
,
const
Src2Desc
&
src2_desc
,
const
Index
&
src2_block_slice_origin
,
const
DstDesc
&
dst_desc
,
const
Index
&
dst_block_slice_origin
,
const
ElementwiseOperation
&
element_op
)
:
threadwise_transfer_
(
src0_desc
,
make_zero_multi_index
<
nDim
>
(),
src1_desc
,
...
...
@@ -62,24 +60,24 @@ struct BlockwiseTensorSliceTransfer_v6r3
element_op
)
{
static_assert
(
nDim
==
remove_ref
erence_t
<
remove_cv
_t
<
Src0Desc
>
>
::
GetNumOfDimension
()
&&
nDim
==
remove_ref
erence_t
<
remove_cv
_t
<
Src1Desc
>
>
::
GetNumOfDimension
()
&&
nDim
==
remove_ref
erence_t
<
remove_cv
_t
<
Src2Desc
>
>
::
GetNumOfDimension
()
&&
nDim
==
remove_ref
erence_t
<
remove_cv
_t
<
DstDesc
>
>
::
GetNumOfDimension
()
&&
static_assert
(
nDim
==
remove_
cv
ref_t
<
Src0Desc
>::
GetNumOfDimension
()
&&
nDim
==
remove_
cv
ref_t
<
Src1Desc
>::
GetNumOfDimension
()
&&
nDim
==
remove_
cv
ref_t
<
Src2Desc
>::
GetNumOfDimension
()
&&
nDim
==
remove_
cv
ref_t
<
DstDesc
>::
GetNumOfDimension
()
&&
nDim
==
ThreadClusterLengths
::
Size
()
&&
nDim
==
ThreadClusterArrangeOrder
::
Size
()
&&
nDim
==
DimAccessOrder
::
Size
(),
"wrong! nDim not consistent"
);
static_assert
(
is_same
<
Block
SliceLengths
,
decltype
(
thread_slice_lengths
*
ThreadClusterLengths
{})
>
{},
is_same
<
SliceLengths
,
decltype
(
thread_slice_lengths
*
ThreadClusterLengths
{})
>
{},
"wrong! threads should be mapped to cover entire slicing window"
);
static_assert
(
BlockSize
>=
thread_cluster_desc_
.
GetElementSize
(),
"wrong!
BlockSize
too small"
);
static_assert
(
ThreadGroup
::
GetNumOfThread
()
>=
thread_cluster_desc_
.
GetElementSize
(),
"wrong!
ThreadGroup::GetNumOfThread()
too small"
);
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
const
auto
thread_cluster_idx
=
thread_cluster_desc_
.
CalculateBottomIndex
(
make_multi_index
(
get_thread_local_1d_id
()));
...
...
@@ -107,8 +105,8 @@ struct BlockwiseTensorSliceTransfer_v6r3
const
DstDesc
&
dst_desc
,
DstBuffer
&
dst_buf
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
Run
(
src0_desc
,
src0_buf
,
src1_desc
,
src1_buf
,
src2_desc
,
src2_buf
,
dst_desc
,
dst_buf
);
...
...
@@ -117,8 +115,8 @@ struct BlockwiseTensorSliceTransfer_v6r3
__device__
void
MoveSrc0SliceWindow
(
const
Src0Desc
&
src0_desc
,
const
Index
&
step
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
MoveSrc0SliceWindow
(
src0_desc
,
step
);
}
...
...
@@ -126,8 +124,8 @@ struct BlockwiseTensorSliceTransfer_v6r3
__device__
void
MoveSrc1SliceWindow
(
const
Src1Desc
&
src1_desc
,
const
Index
&
step
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
MoveSrc1SliceWindow
(
src1_desc
,
step
);
}
...
...
@@ -135,8 +133,8 @@ struct BlockwiseTensorSliceTransfer_v6r3
__device__
void
MoveSrc2SliceWindow
(
const
Src2Desc
&
src2_desc
,
const
Index
&
step
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
MoveSrc2SliceWindow
(
src2_desc
,
step
);
}
...
...
@@ -144,8 +142,8 @@ struct BlockwiseTensorSliceTransfer_v6r3
__device__
void
MoveDstSliceWindow
(
const
DstDesc
&
dst_desc
,
const
Index
&
step
)
{
if
(
BlockSize
==
thread_cluster_desc_
.
GetElementSize
()
or
get_thread_local_1d_i
d
()
<
thread_cluster_desc_
.
GetElementSize
())
if
(
ThreadGroup
::
GetNumOfThread
()
==
thread_cluster_desc_
.
GetElementSize
()
or
ThreadGroup
::
GetThreadI
d
()
<
thread_cluster_desc_
.
GetElementSize
())
{
threadwise_transfer_
.
MoveDstSliceWindow
(
dst_desc
,
step
);
}
...
...
@@ -179,4 +177,3 @@ struct BlockwiseTensorSliceTransfer_v6r3
};
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
0 → 100644
View file @
a3b4c5cb
#pragma once
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
/// Tag identifying which specialization of the convolution backward-weight
/// operation is selected (per the type name).
///
/// NOTE(review): the meaning of each enumerator is not defined in this header;
/// the names suggest filter-size / stride / padding special cases and an
/// odd-channel-count case -- confirm against the device ops that consume it.
/// Enumerator order (and therefore the underlying values 0..3) is preserved.
enum class ConvolutionBackwardWeightSpecialization
{
    Default,              // underlying value 0
    Filter1x1Stride1Pad0, // underlying value 1
    Filter1x1Pad0,        // underlying value 2
    OddC                  // underlying value 3
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp
0 → 100644
View file @
a3b4c5cb
#pragma once
#include <iostream>
#include <sstream>
#include "device.hpp"
#include "device_base.hpp"
#include "common_header.hpp"
#include "gridwise_5ary_Elementwise_1d.hpp"
#include "tensor_layout.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
template
<
typename
ADataType
,
typename
BDataType
,
typename
CDataType
,
typename
DDataType
,
typename
EDataType
,
typename
FDataType
,
typename
ComputeDataType
,
typename
ElementwiseFunctor
,
index_t
NDim
,
index_t
MPerThread
,
index_t
AScalarPerVector
,
index_t
BScalarPerVector
,
index_t
CScalarPerVector
,
index_t
DScalarPerVector
,
index_t
EScalarPerVector
,
index_t
FScalarPerVector
>
struct
Device5AryElementwise
:
public
BaseOperator
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
template
<
typename
Desc_M
>
static
auto
PadDescriptor_M_1d
(
Desc_M
desc_m
,
index_t
gridSize
,
index_t
blockSize
)
{
const
auto
m
=
desc_m
.
GetLength
(
I0
);
const
index_t
loop_step
=
gridSize
*
blockSize
*
MPerThread
;
const
auto
pad
=
math
::
integer_least_multiple
(
m
,
loop_step
)
-
m
;
const
auto
desc_m_pad
=
transform_tensor_descriptor
(
desc_m
,
make_tuple
(
make_right_pad_transform
(
m
,
pad
)),
make_tuple
(
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
return
desc_m_pad
;
}
/// Builds the padded 1-D descriptor for one operand.
///
/// Steps, as performed below:
///   1. read the first NDim entries of `lengths` and `stride` into tuples;
///   2. build a naive NDim-dimensional descriptor from them;
///   3. when NDim > 1, merge all dimensions into a single dimension;
///   4. right-pad the result with PadDescriptor_M_1d.
///
/// @param lengths   per-dimension lengths; indices 0..NDim-1 are read
/// @param stride    per-dimension strides; indices 0..NDim-1 are read
/// @param gridSize  forwarded to PadDescriptor_M_1d
/// @param blockSize forwarded to PadDescriptor_M_1d
/// @return the padded descriptor returned by PadDescriptor_M_1d
static auto MakeDescriptor_M(const std::vector<index_t>& lengths,
                             const std::vector<index_t>& stride,
                             index_t gridSize,
                             index_t blockSize)
{
    auto shape_tuple  = generate_tuple([&](auto I) { return lengths[I]; }, Number<NDim>{});
    auto stride_tuple = generate_tuple([&](auto I) { return stride[I]; }, Number<NDim>{});

    // N-d descriptor - [s0, s1, s2, ...]
    const auto nd_desc = make_naive_tensor_descriptor(shape_tuple, stride_tuple);

    if constexpr(NDim <= 1)
    {
        // Nothing to merge: pad the descriptor as it is.
        return PadDescriptor_M_1d(nd_desc, gridSize, blockSize);
    }
    else
    {
        // Merge N-d into a 1-d descriptor - [s0 * s1 * ...]
        const auto merged_desc = transform_tensor_descriptor(
            nd_desc,
            make_tuple(make_merge_transform(shape_tuple)),
            make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<NDim>{})),
            make_tuple(Sequence<0>{}));

        return PadDescriptor_M_1d(merged_desc, gridSize, blockSize);
    }
}
using
AGridDesc_M
=
decltype
(
MakeDescriptor_M
({
1
,
1
},
{
1
,
1
},
1
,
1
));
using
BGridDesc_M
=
decltype
(
MakeDescriptor_M
({
1
,
1
},
{
1
,
1
},
1
,
1
));
using
CGridDesc_M
=
decltype
(
MakeDescriptor_M
({
1
,
1
},
{
1
,
1
},
1
,
1
));
using
DGridDesc_M
=
decltype
(
MakeDescriptor_M
({
1
,
1
},
{
1
,
1
},
1
,
1
));
using
EGridDesc_M
=
decltype
(
MakeDescriptor_M
({
1
,
1
},
{
1
,
1
},
1
,
1
));
using
FGridDesc_M
=
decltype
(
MakeDescriptor_M
({
1
,
1
},
{
1
,
1
},
1
,
1
));
using
Gridwise5AryEltwise
=
Gridwise5AryElementwise_1D
<
ADataType
,
BDataType
,
CDataType
,
DDataType
,
EDataType
,
FDataType
,
ComputeDataType
,
AGridDesc_M
,
BGridDesc_M
,
CGridDesc_M
,
DGridDesc_M
,
EGridDesc_M
,
FGridDesc_M
,
ElementwiseFunctor
,
MPerThread
,
AScalarPerVector
,
BScalarPerVector
,
CScalarPerVector
,
DScalarPerVector
,
EScalarPerVector
,
FScalarPerVector
>
;
/// Bundles everything the invoker passes to the kernel: the six operand
/// pointers (five const inputs a..e, one mutable f), the shared lengths, the
/// per-operand strides, the padded 1-D descriptors built from them, the
/// functor, and the launch dimensions.
struct Argument : public BaseArgument
{
    /// Stores the pointers/vectors/functor, fixes the launch dimensions
    /// (block size 256, grid size 120) and builds one padded 1-D descriptor
    /// per operand from `lengths` and that operand's strides.
    Argument(const ADataType* p_a,
             const BDataType* p_b,
             const CDataType* p_c,
             const DDataType* p_d,
             const EDataType* p_e,
             FDataType* p_f,
             const std::vector<index_t>& lengths,
             const std::vector<index_t>& a_strides,
             const std::vector<index_t>& b_strides,
             const std::vector<index_t>& c_strides,
             const std::vector<index_t>& d_strides,
             const std::vector<index_t>& e_strides,
             const std::vector<index_t>& f_strides,
             ElementwiseFunctor functor)
        : p_a_(p_a),
          p_b_(p_b),
          p_c_(p_c),
          p_d_(p_d),
          p_e_(p_e),
          p_f_(p_f),
          lengths_(lengths),
          a_strides_(a_strides),
          b_strides_(b_strides),
          c_strides_(c_strides),
          d_strides_(d_strides),
          e_strides_(e_strides),
          f_strides_(f_strides),
          functor_(functor),
          blockSize_(256),
          gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
    {
        // Every operand uses the same lengths and launch dimensions; only the
        // strides differ.
        const auto build_desc = [&](const std::vector<index_t>& operand_strides) {
            return MakeDescriptor_M(lengths, operand_strides, gridSize_, blockSize_);
        };

        a_grid_desc_m_ = build_desc(a_strides);
        b_grid_desc_m_ = build_desc(b_strides);
        c_grid_desc_m_ = build_desc(c_strides);
        d_grid_desc_m_ = build_desc(d_strides);
        e_grid_desc_m_ = build_desc(e_strides);
        f_grid_desc_m_ = build_desc(f_strides);
    }

    // Operand pointers: a..e are read-only, f is writable.
    const ADataType* p_a_;
    const BDataType* p_b_;
    const CDataType* p_c_;
    const DDataType* p_d_;
    const EDataType* p_e_;
    FDataType* p_f_;

    // Lengths shared by all operands.
    std::vector<index_t> lengths_;

    // Padded 1-D descriptors, one per operand (assigned in the constructor body).
    AGridDesc_M a_grid_desc_m_;
    BGridDesc_M b_grid_desc_m_;
    CGridDesc_M c_grid_desc_m_;
    DGridDesc_M d_grid_desc_m_;
    EGridDesc_M e_grid_desc_m_;
    FGridDesc_M f_grid_desc_m_;

    // Per-operand strides as supplied by the caller.
    std::vector<index_t> a_strides_;
    std::vector<index_t> b_strides_;
    std::vector<index_t> c_strides_;
    std::vector<index_t> d_strides_;
    std::vector<index_t> e_strides_;
    std::vector<index_t> f_strides_;

    ElementwiseFunctor functor_;

    // Launch dimensions (currently hard-coded in the constructor).
    index_t blockSize_;
    index_t gridSize_;
};
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
const
auto
kernel
=
kernel_5ary_elementwise_1d
<
Gridwise5AryEltwise
,
ADataType
,
BDataType
,
CDataType
,
DDataType
,
EDataType
,
FDataType
,
AGridDesc_M
,
BGridDesc_M
,
CGridDesc_M
,
DGridDesc_M
,
EGridDesc_M
,
FGridDesc_M
,
ElementwiseFunctor
>
;
float
elapsed_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
arg
.
gridSize_
),
dim3
(
arg
.
blockSize_
),
0
,
arg
.
p_a_
,
arg
.
p_b_
,
arg
.
p_c_
,
arg
.
p_d_
,
arg
.
p_e_
,
arg
.
p_f_
,
arg
.
a_grid_desc_m_
,
arg
.
b_grid_desc_m_
,
arg
.
c_grid_desc_m_
,
arg
.
d_grid_desc_m_
,
arg
.
e_grid_desc_m_
,
arg
.
f_grid_desc_m_
,
arg
.
functor_
);
return
elapsed_time
;
}
// polymorphic
float
Run
(
const
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
}
};
bool
IsSupportedArgument
(
const
BaseArgument
&
p_arg
)
{
return
IsSupportedArgument
(
&
p_arg
);
}
bool
IsSupportedArgument
(
const
BaseArgument
*
p_arg
)
override
{
const
Argument
*
pArg
=
dynamic_cast
<
const
Argument
*>
(
p_arg
);
if
(
pArg
==
nullptr
)
return
false
;
if
(
pArg
->
lengths_
.
size
()
!=
NDim
)
return
false
;
if
(
pArg
->
lengths_
.
back
()
%
MPerThread
!=
0
)
return
false
;
auto
IsScalarPerVectorValid
=
[](
bool
isLastDimensionCoalesced
,
int
scalarPerVector
)
{
bool
ret
=
true
;
if
(
!
isLastDimensionCoalesced
)
ret
=
scalarPerVector
==
1
;
else
ret
=
MPerThread
%
scalarPerVector
==
0
;
return
ret
;
};
if
(
!
IsScalarPerVectorValid
(
pArg
->
a_strides_
.
back
()
==
1
,
AScalarPerVector
))
return
false
;
if
(
!
IsScalarPerVectorValid
(
pArg
->
b_strides_
.
back
()
==
1
,
BScalarPerVector
))
return
false
;
if
(
!
IsScalarPerVectorValid
(
pArg
->
c_strides_
.
back
()
==
1
,
CScalarPerVector
))
return
false
;
if
(
!
IsScalarPerVectorValid
(
pArg
->
d_strides_
.
back
()
==
1
,
DScalarPerVector
))
return
false
;
if
(
!
IsScalarPerVectorValid
(
pArg
->
e_strides_
.
back
()
==
1
,
EScalarPerVector
))
return
false
;
if
(
!
IsScalarPerVectorValid
(
pArg
->
f_strides_
.
back
()
==
1
,
FScalarPerVector
))
return
false
;
return
true
;
};
/// Typed factory: constructs an Argument by value from typed operand pointers,
/// the shared lengths, the per-operand strides and the functor. All parameters
/// are forwarded to the Argument constructor in the same order.
static auto MakeArgument(const ADataType* p_a,
                         const BDataType* p_b,
                         const CDataType* p_c,
                         const DDataType* p_d,
                         const EDataType* p_e,
                         FDataType* p_f,
                         std::vector<index_t> lengths,
                         std::vector<index_t> a_strides,
                         std::vector<index_t> b_strides,
                         std::vector<index_t> c_strides,
                         std::vector<index_t> d_strides,
                         std::vector<index_t> e_strides,
                         std::vector<index_t> f_strides,
                         ElementwiseFunctor functor)
{
    return Argument(p_a,
                    p_b,
                    p_c,
                    p_d,
                    p_e,
                    p_f,
                    lengths,
                    a_strides,
                    b_strides,
                    c_strides,
                    d_strides,
                    e_strides,
                    f_strides,
                    functor);
}
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_a
,
const
void
*
p_b
,
const
void
*
p_c
,
const
void
*
p_d
,
const
void
*
p_e
,
void
*
p_f
,
std
::
vector
<
index_t
>
lengths
,
std
::
vector
<
index_t
>
a_strides
,
std
::
vector
<
index_t
>
b_strides
,
std
::
vector
<
index_t
>
c_strides
,
std
::
vector
<
index_t
>
d_strides
,
std
::
vector
<
index_t
>
e_strides
,
std
::
vector
<
index_t
>
f_strides
,
ElementwiseFunctor
functor
)
{
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
ADataType
*>
(
p_a
),
static_cast
<
const
BDataType
*>
(
p_b
),
static_cast
<
const
CDataType
*>
(
p_c
),
static_cast
<
const
DDataType
*>
(
p_d
),
static_cast
<
const
EDataType
*>
(
p_e
),
static_cast
<
FDataType
*>
(
p_f
),
lengths
,
a_strides
,
b_strides
,
c_strides
,
d_strides
,
e_strides
,
f_strides
,
functor
);
}
/// Typed factory: returns a value-initialized Invoker.
static auto MakeInvoker()
{
    return Invoker();
}
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
{
return
std
::
make_unique
<
Invoker
>
();
}
};
// struct Device5AryElementwise
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_base.hpp
View file @
a3b4c5cb
#ifndef DEVICE_BASE_HPP
#define DEVICE_BASE_HPP
#pragma once
#include <string>
#include "stream_config.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
...
...
@@ -22,7 +23,10 @@ struct BaseInvoker
BaseInvoker
(
const
BaseInvoker
&
)
=
default
;
BaseInvoker
&
operator
=
(
const
BaseInvoker
&
)
=
default
;
virtual
float
Run
(
const
BaseArgument
*
,
int
=
1
)
=
0
;
virtual
float
Run
(
const
BaseArgument
*
,
const
StreamConfig
&
=
StreamConfig
{})
{
return
float
{
0
};
}
virtual
~
BaseInvoker
()
{}
};
...
...
@@ -33,8 +37,12 @@ struct BaseOperator
BaseOperator
(
const
BaseOperator
&
)
=
default
;
BaseOperator
&
operator
=
(
const
BaseOperator
&
)
=
default
;
virtual
bool
IsSupportedArgument
(
const
BaseArgument
*
)
=
0
;
virtual
std
::
string
GetTypeString
()
const
=
0
;
virtual
bool
IsSupportedArgument
(
const
BaseArgument
*
)
{
return
false
;
}
virtual
std
::
string
GetTypeString
()
const
{
return
""
;
}
virtual
size_t
GetWorkSpaceSize
(
const
BaseArgument
*
)
const
{
return
0
;
}
virtual
void
SetWorkSpacePointer
(
BaseArgument
*
,
void
*
)
const
{}
virtual
~
BaseOperator
()
{}
};
...
...
@@ -42,4 +50,3 @@ struct BaseOperator
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/device/device_batched_gemm_reduce_xdl_cshuffle.hpp
View file @
a3b4c5cb
...
...
@@ -17,12 +17,12 @@ namespace device {
template
<
typename
GridwiseGemm
,
typename
FloatAB
,
typename
FloatC
,
typename
FloatD
,
typename
DPtrsGlobal
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CElementwiseOperation
,
typename
D
0Reduc
eOperation
,
typename
D
1Reduc
eOperation
,
typename
D
xsInElementwis
eOperation
,
typename
D
xsAccElementwis
eOperation
,
typename
AGridDesc_AK0_M_AK1
,
typename
BGridDesc_BK0_N_BK1
,
typename
CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
,
...
...
@@ -38,14 +38,13 @@ __global__ void
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
FloatC
*
__restrict__
p_c_grid
,
FloatD
*
__restrict__
p_d0_grid
,
FloatD
*
__restrict__
p_d1_grid
,
DPtrsGlobal
p_ds_grid
,
const
index_t
batch_count
,
const
AElementwiseOperation
a_element_op
,
const
BElementwiseOperation
b_element_op
,
const
CElementwiseOperation
c_element_op
,
const
D
0ReduceOperation
d0_reduce
_op
,
const
D
1ReduceOperation
d1_reduce
_op
,
const
D
xsInElementwiseOperation
dxs_in_element
_op
,
const
D
xsAccElementwiseOperation
dxs_out_element
_op
,
const
AGridDesc_AK0_M_AK1
a_grid_desc_ak0_m_ak1
,
const
BGridDesc_BK0_N_BK1
b_grid_desc_bk0_n_bk1
,
const
CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
...
...
@@ -66,24 +65,24 @@ __global__ void
const
long_index_t
c_batch_offset
=
__builtin_amdgcn_readfirstlane
(
static_cast
<
long_index_t
>
(
compute_base_ptr_of_batch_
.
GetCBasePtr
(
g_idx
)));
const
long_index_t
d0_batch_offset
=
__builtin_amdgcn_readfirstlane
(
static_cast
<
long_index_t
>
(
compute_base_ptr_of_batch_
.
GetD0BasePtr
(
g_idx
)));
const
long_index_t
d1_batch_offset
=
__builtin_amdgcn_readfirstlane
(
static_cast
<
long_index_t
>
(
compute_base_ptr_of_batch_
.
GetD1BasePtr
(
g_idx
)));
static_for
<
0
,
p_ds_grid
.
Size
(),
1
>
{}([
&
](
auto
In
)
{
const
long_index_t
d_batch_offset
=
__builtin_amdgcn_readfirstlane
(
static_cast
<
long_index_t
>
(
compute_base_ptr_of_batch_
.
GetDBasePtr
(
g_idx
,
In
)));
p_ds_grid
(
In
)
=
p_ds_grid
(
In
)
+
d_batch_offset
;
});
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
GridwiseGemm
::
template
Run
<
HasMainK0BlockLoop
>(
p_a_grid
+
a_batch_offset
,
p_b_grid
+
b_batch_offset
,
p_c_grid
+
c_batch_offset
,
p_d0_grid
+
d0_batch_offset
,
p_d1_grid
+
d1_batch_offset
,
p_ds_grid
,
p_shared
,
a_element_op
,
b_element_op
,
c_element_op
,
d
0_reduce
_op
,
d
1_reduce
_op
,
d
xs_in_element
_op
,
d
xs_out_element
_op
,
a_grid_desc_ak0_m_ak1
,
b_grid_desc_bk0_n_bk1
,
c_grid_desc_mblock_mperblock_nblock_nperblock
,
...
...
@@ -93,14 +92,13 @@ __global__ void
ignore
=
p_a_grid
;
ignore
=
p_b_grid
;
ignore
=
p_c_grid
;
ignore
=
p_d0_grid
;
ignore
=
p_d1_grid
;
ignore
=
p_ds_grid
;
ignore
=
batch_count
;
ignore
=
a_element_op
;
ignore
=
b_element_op
;
ignore
=
c_element_op
;
ignore
=
d
0_reduce
_op
;
ignore
=
d
1_reduce
_op
;
ignore
=
d
xs_in_element
_op
;
ignore
=
d
xs_out_element
_op
;
ignore
=
a_grid_desc_ak0_m_ak1
;
ignore
=
b_grid_desc_bk0_n_bk1
;
ignore
=
c_grid_desc_mblock_mperblock_nblock_nperblock
;
...
...
@@ -110,6 +108,9 @@ __global__ void
#endif // end of if defined (defined(__gfx908__) || defined(__gfx90a__))
}
// Note: the inter-wave loop scheduler is rolled out to the c-shuffle version first, because the
// non-c-shuffle version currently has compiler issues with register spilling, which in turn
// cause validation failures.
template
<
typename
ALayout
,
typename
BLayout
,
typename
CLayout
,
...
...
@@ -119,12 +120,14 @@ template <typename ALayout,
typename
GemmAccDataType
,
typename
CShuffleDataType
,
typename
ReduceAccDataType
,
typename
D
DataType
,
typename
D
PtrsGlobal
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CElementwiseOperation
,
typename
D0ReduceOperation
,
typename
D1ReduceOperation
,
typename
DxsReduceOperation
,
typename
DxsInElementwiseOperation
,
typename
DxsAccElementwiseOperation
,
typename
DGlobalMemoryDataOperation
,
GemmSpecialization
GemmSpec
,
index_t
NumGemmKPrefetchStage
,
index_t
BlockSize
,
...
...
@@ -157,12 +160,14 @@ template <typename ALayout,
index_t
CShuffleBlockTransferScalarPerVector_NPerBlock
,
typename
CReduceThreadClusterLengths_MPerBlock_NPerBlock
,
index_t
CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock
,
index_t
CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock
>
struct
DeviceBatchedGemmReduce_Xdl_CShuffle
:
public
DeviceGemmReduce
<
AElementwiseOperation
,
index_t
CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock
,
LoopScheduler
LoopSched
=
make_default_loop_scheduler
()>
struct
DeviceBatchedGemmReduce_Xdl_CShuffle
:
public
DeviceGemmReduce
<
DPtrsGlobal
,
AElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
,
D
0Reduc
eOperation
,
D
1Reduc
eOperation
>
D
xsInElementwis
eOperation
,
D
xsAccElementwis
eOperation
>
{
using
DeviceOp
=
DeviceBatchedGemmReduce_Xdl_CShuffle
;
...
...
@@ -465,56 +470,16 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
using
CGridDesc_M_N
=
decltype
(
MakeCGridDescriptor_M_N
(
1
,
1
,
1
));
using
DGridDesc_M
=
decltype
(
MakeDGridDescriptor_M
(
1
));
static
constexpr
auto
MakeBlock2CTileMap
(
index_t
batch_count
,
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
index_t
M01
,
index_t
N01
)
{
const
auto
M
=
c_grid_desc_m_n
.
GetLength
(
I0
);
const
auto
N
=
c_grid_desc_m_n
.
GetLength
(
I1
);
constexpr
auto
M1
=
Number
<
MPerBlock
>
{};
constexpr
auto
N1
=
Number
<
NPerBlock
>
{};
const
auto
M0
=
M
/
M1
;
const
auto
N0
=
N
/
N1
;
const
auto
M00
=
M0
/
M01
;
const
auto
N00
=
N0
/
N01
;
const
auto
g_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_insert_transform
(
batch_count
),
make_unmerge_transform
(
make_tuple
(
M00
,
M01
)),
make_unmerge_transform
(
make_tuple
(
N00
,
N01
))),
make_tuple
(
Sequence
<>
{},
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
3
>
{},
Sequence
<
2
,
4
>
{}));
const
auto
globalblockid_to_m00_m01_n00_n01_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
batch_count
,
M00
,
N00
,
M01
,
N01
))),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
,
4
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
const
auto
globalblockid_to_m0_n0_block_cluster_adaptor
=
chain_tensor_adaptors
(
g_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor
,
globalblockid_to_m00_m01_n00_n01_block_cluster_adaptor
);
return
globalblockid_to_m0_n0_block_cluster_adaptor
;
}
struct
ComputeBasePtrOfStridedBatch
{
ComputeBasePtrOfStridedBatch
(
index_t
BatchStrideA
,
index_t
BatchStrideB
,
index_t
BatchStrideC
,
index_t
BatchStrideD0
,
index_t
BatchStrideD1
)
index_t
BatchStrideD
)
:
BatchStrideA_
(
BatchStrideA
),
BatchStrideB_
(
BatchStrideB
),
BatchStrideC_
(
BatchStrideC
),
BatchStrideD0_
(
BatchStrideD0
),
BatchStrideD1_
(
BatchStrideD1
)
BatchStrideD_
(
BatchStrideD
)
{
}
...
...
@@ -533,22 +498,20 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideC_
);
}
__host__
__device__
constexpr
long_index_t
GetD0BasePtr
(
index_t
g_idx
)
const
{
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideD0_
);
}
__host__
__device__
constexpr
long_index_t
GetD1BasePtr
(
index_t
g_idx
)
const
template
<
index_t
I
>
__host__
__device__
constexpr
long_index_t
GetDBasePtr
(
index_t
g_idx
,
Number
<
I
>
reduction_idx
)
const
{
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideD1_
);
// TODO - Support sequence of StrideD in MakeArgument()
(
void
)
reduction_idx
;
return
g_idx
*
static_cast
<
long_index_t
>
(
BatchStrideD_
);
}
private:
index_t
BatchStrideA_
;
index_t
BatchStrideB_
;
index_t
BatchStrideC_
;
index_t
BatchStrideD0_
;
index_t
BatchStrideD1_
;
index_t
BatchStrideD_
;
};
// GridwiseGemm
...
...
@@ -558,14 +521,15 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
CShuffleDataType
,
CDataType
,
ReduceAccDataType
,
D
DataType
,
D
PtrsGlobal
,
AElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
,
D0ReduceOperation
,
D1ReduceOperation
,
DxsReduceOperation
,
DxsInElementwiseOperation
,
DxsAccElementwiseOperation
,
InMemoryDataOperationEnum
::
Set
,
In
MemoryDataOperation
Enum
::
AtomicAdd
,
DGlobal
MemoryDataOperation
,
AGridDesc_AK0_M_AK1
,
BGridDesc_BK0_N_BK1
,
CGridDesc_M_N
,
...
...
@@ -603,9 +567,8 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
CShuffleBlockTransferScalarPerVector_NPerBlock
,
CReduceThreadClusterLengths_MPerBlock_NPerBlock
,
CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock
,
CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock
>
;
using
Block2CTileMap
=
decltype
(
MakeBlock2CTileMap
(
1
,
CGridDesc_M_N
{},
1
,
1
));
CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock
,
LoopSched
>
;
// Argument
struct
Argument
:
public
BaseArgument
...
...
@@ -613,8 +576,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
Argument
(
const
ADataType
*
p_a_grid
,
const
BDataType
*
p_b_grid
,
CDataType
*
p_c_grid
,
DDataType
*
p_d0_grid
,
DDataType
*
p_d1_grid
,
DPtrsGlobal
p_ds_grid
,
index_t
MRaw
,
index_t
NRaw
,
index_t
KRaw
,
...
...
@@ -624,14 +586,13 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
,
D
0ReduceOperation
d0_reduce
_op
,
D
1ReduceOperation
d1_reduce
_op
,
D
xsInElementwiseOperation
dxs_in_element
_op
,
D
xsAccElementwiseOperation
dxs_out_element
_op
,
index_t
BatchCount
)
:
p_a_grid_
{
p_a_grid
},
p_b_grid_
{
p_b_grid
},
p_c_grid_
{
p_c_grid
},
p_d0_grid_
{
p_d0_grid
},
p_d1_grid_
{
p_d1_grid
},
p_ds_grid_
{
p_ds_grid
},
BatchCount_
(
BatchCount
),
a_grid_desc_ak0_m_ak1_
{
DeviceOp
::
MakeAGridDescriptor_AK0_M_AK1
(
MRaw
,
KRaw
,
StrideA
)},
b_grid_desc_bk0_n_bk1_
{
DeviceOp
::
MakeBGridDescriptor_BK0_N_BK1
(
KRaw
,
NRaw
,
StrideB
)},
...
...
@@ -639,20 +600,22 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
d_grid_desc_m_
{
DeviceOp
::
MakeDGridDescriptor_M
(
MRaw
)},
c_grid_desc_mblock_mperblock_nblock_nperblock_
{},
d_grid_desc_mblock_mperblock_
{},
compute_base_ptr_of_batch_
{
a_grid_desc_ak0_m_ak1_
.
GetElementSpaceSize
(),
b
_grid_desc_
b
k0_
n_b
k1_
.
GetElementSpaceSize
(),
c
_grid_desc_
m_n
_
.
GetElementSpaceSize
(),
d
_grid_desc_m_
.
GetElementSpaceSize
(),
d_grid_desc_m_
.
GetElementSpaceSize
()},
block_2_ctile_map_
{},
compute_base_ptr_of_batch_
{
type_convert
<
index_t
>
(
a
_grid_desc_
a
k0_
m_a
k1_
.
GetElementSpaceSize
()
)
,
type_convert
<
index_t
>
(
b
_grid_desc_
bk0_n_bk1
_
.
GetElementSpaceSize
()
)
,
type_convert
<
index_t
>
(
c
_grid_desc_m_
n_
.
GetElementSpaceSize
()
)
,
type_convert
<
index_t
>
(
d_grid_desc_m_
.
GetElementSpaceSize
()
)
},
block_2_ctile_map_
{
GridwiseGemm
::
MakeDefaultBlock2CTileMap
(
c_grid_desc_m_n_
)
},
a_element_op_
{
a_element_op
},
b_element_op_
{
b_element_op
},
c_element_op_
{
c_element_op
},
d
0_reduce_op_
{
d0_reduce
_op
},
d
1_reduce_op_
{
d1_reduce
_op
}
d
xs_in_element_op_
{
dxs_in_element
_op
},
d
xs_out_element_op_
{
dxs_out_element
_op
}
{
if
(
GridwiseGemm
::
CheckValidity
(
a_grid_desc_ak0_m_ak1_
,
b_grid_desc_bk0_n_bk1_
,
c_grid_desc_m_n_
))
if
(
GridwiseGemm
::
CheckValidity
(
a_grid_desc_ak0_m_ak1_
,
b_grid_desc_bk0_n_bk1_
,
c_grid_desc_m_n_
,
block_2_ctile_map_
))
{
c_grid_desc_mblock_mperblock_nblock_nperblock_
=
GridwiseGemm
::
MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
...
...
@@ -660,8 +623,6 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
d_grid_desc_mblock_mperblock_
=
GridwiseGemm
::
MakeDGridDescriptor_MBlock_MPerBlock
(
d_grid_desc_m_
);
block_2_ctile_map_
=
MakeBlock2CTileMap
(
BatchCount
,
c_grid_desc_m_n_
,
1
,
1
);
}
}
...
...
@@ -669,8 +630,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
const
ADataType
*
p_a_grid_
;
const
BDataType
*
p_b_grid_
;
CDataType
*
p_c_grid_
;
DDataType
*
p_d0_grid_
;
DDataType
*
p_d1_grid_
;
DPtrsGlobal
p_ds_grid_
;
index_t
BatchCount_
;
AGridDesc_AK0_M_AK1
a_grid_desc_ak0_m_ak1_
;
BGridDesc_BK0_N_BK1
b_grid_desc_bk0_n_bk1_
;
...
...
@@ -680,12 +640,12 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
c_grid_desc_mblock_mperblock_nblock_nperblock_
;
typename
GridwiseGemm
::
DGridDescriptor_MBlock_MPerBlock
d_grid_desc_mblock_mperblock_
;
ComputeBasePtrOfStridedBatch
compute_base_ptr_of_batch_
;
Block2CTileMap
block_2_ctile_map_
;
typename
GridwiseGemm
::
Default
Block2CTileMap
block_2_ctile_map_
;
AElementwiseOperation
a_element_op_
;
BElementwiseOperation
b_element_op_
;
CElementwiseOperation
c_element_op_
;
D
0ReduceOperation
d0_reduce
_op_
;
D
1ReduceOperation
d1_reduce
_op_
;
D
xsInElementwiseOperation
dxs_in_element
_op_
;
D
xsAccElementwiseOperation
dxs_out_element
_op_
;
};
// Invoker
...
...
@@ -693,7 +653,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
{
using
Argument
=
DeviceOp
::
Argument
;
float
Run
(
const
Argument
&
arg
,
int
/* nrepeat */
=
1
)
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{}
)
{
#if 0
{
...
...
@@ -717,60 +677,63 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
}
#endif
if
(
!
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_ak0_m_ak1_
,
arg
.
b_grid_desc_bk0_n_bk1_
,
arg
.
c_grid_desc_m_n_
))
if
(
!
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_ak0_m_ak1_
,
arg
.
b_grid_desc_bk0_n_bk1_
,
arg
.
c_grid_desc_m_n_
,
arg
.
block_2_ctile_map_
))
{
throw
std
::
runtime_error
(
"wrong! GridwiseGemm has invalid setting"
);
}
const
index_t
grid_size
=
GridwiseGemm
::
CalculateGridSize
(
arg
.
c_grid_desc_m_n_
)
*
arg
.
BatchCount_
;
const
auto
K0
=
arg
.
a_grid_desc_ak0_m_ak1_
.
GetLength
(
I0
);
arg
.
block_2_ctile_map_
.
CalculateGridSize
(
arg
.
c_grid_desc_m_n_
)
*
arg
.
BatchCount_
;
const
bool
has_main_k0_block_loop
=
GridwiseGemm
::
CalculateHasMainK0BlockLoop
(
K0
);
const
auto
K
=
arg
.
a_grid_desc_ak0_m_ak1_
.
GetLength
(
I0
)
*
arg
.
a_grid_desc_ak0_m_ak1_
.
GetLength
(
I2
);
if
(
has_main_k0_block_loop
)
float
elapsed_time
=
0.0
f
;
if
(
GridwiseGemm
::
CalculateHasMainKBlockLoop
(
K
))
{
const
auto
kernel
=
kernel_batched_gemm_reduce_xdl_cshuffle_v1
<
GridwiseGemm
,
ADataType
,
// TODO: distinguish A/B datatype
CDataType
,
D
DataType
,
D
PtrsGlobal
,
AElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
,
D
0Reduc
eOperation
,
D
1Reduc
eOperation
,
D
xsInElementwis
eOperation
,
D
xsAccElementwis
eOperation
,
DeviceOp
::
AGridDesc_AK0_M_AK1
,
DeviceOp
::
BGridDesc_BK0_N_BK1
,
typename
GridwiseGemm
::
CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
,
typename
GridwiseGemm
::
DGridDescriptor_MBlock_MPerBlock
,
ComputeBasePtrOfStridedBatch
,
remove_reference_t
<
Block2CTileMap
>
,
typename
GridwiseGemm
::
Default
Block2CTileMap
,
true
>
;
launch_kernel
(
kernel
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
p_d0_grid_
,
arg
.
p_d1_grid_
,
arg
.
BatchCount_
,
arg
.
a_element_op_
,
arg
.
b_element_op_
,
arg
.
c_element_op_
,
arg
.
d0_reduce_op_
,
arg
.
d1_reduce_op_
,
arg
.
a_grid_desc_ak0_m_ak1_
,
arg
.
b_grid_desc_bk0_n_bk1_
,
arg
.
c_grid_desc_mblock_mperblock_nblock_nperblock_
,
arg
.
d_grid_desc_mblock_mperblock_
,
arg
.
compute_base_ptr_of_batch_
,
arg
.
block_2_ctile_map_
);
elapsed_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
p_ds_grid_
,
arg
.
BatchCount_
,
arg
.
a_element_op_
,
arg
.
b_element_op_
,
arg
.
c_element_op_
,
arg
.
dxs_in_element_op_
,
arg
.
dxs_out_element_op_
,
arg
.
a_grid_desc_ak0_m_ak1_
,
arg
.
b_grid_desc_bk0_n_bk1_
,
arg
.
c_grid_desc_mblock_mperblock_nblock_nperblock_
,
arg
.
d_grid_desc_mblock_mperblock_
,
arg
.
compute_base_ptr_of_batch_
,
arg
.
block_2_ctile_map_
);
}
else
{
...
...
@@ -778,50 +741,52 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
GridwiseGemm
,
ADataType
,
// TODO: distinguish A/B datatype
CDataType
,
D
DataType
,
D
PtrsGlobal
,
AElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
,
D
0Reduc
eOperation
,
D
1Reduc
eOperation
,
D
xsInElementwis
eOperation
,
D
xsAccElementwis
eOperation
,
DeviceOp
::
AGridDesc_AK0_M_AK1
,
DeviceOp
::
BGridDesc_BK0_N_BK1
,
typename
GridwiseGemm
::
CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
,
typename
GridwiseGemm
::
DGridDescriptor_MBlock_MPerBlock
,
ComputeBasePtrOfStridedBatch
,
remove_reference_t
<
Block2CTileMap
>
,
typename
GridwiseGemm
::
Default
Block2CTileMap
,
false
>
;
launch_kernel
(
kernel
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
p_d0_grid_
,
arg
.
p_d1_grid_
,
arg
.
BatchCount_
,
arg
.
a_element_op_
,
arg
.
b_element_op_
,
arg
.
c_element_op_
,
arg
.
d0_reduce_op_
,
arg
.
d1_reduce_op_
,
arg
.
a_grid_desc_ak0_m_ak1_
,
arg
.
b_grid_desc_bk0_n_bk1_
,
arg
.
c_grid_desc_mblock_mperblock_nblock_nperblock_
,
arg
.
d_grid_desc_mblock_mperblock_
,
arg
.
compute_base_ptr_of_batch_
,
arg
.
block_2_ctile_map_
);
elapsed_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
p_ds_grid_
,
arg
.
BatchCount_
,
arg
.
a_element_op_
,
arg
.
b_element_op_
,
arg
.
c_element_op_
,
arg
.
dxs_in_element_op_
,
arg
.
dxs_out_element_op_
,
arg
.
a_grid_desc_ak0_m_ak1_
,
arg
.
b_grid_desc_bk0_n_bk1_
,
arg
.
c_grid_desc_mblock_mperblock_nblock_nperblock_
,
arg
.
d_grid_desc_mblock_mperblock_
,
arg
.
compute_base_ptr_of_batch_
,
arg
.
block_2_ctile_map_
);
}
return
0
;
return
elapsed_time
;
}
// polymorphic
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
)
override
float
Run
(
const
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
);
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
}
};
...
...
@@ -833,8 +798,10 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
return
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_ak0_m_ak1_
,
arg
.
b_grid_desc_bk0_n_bk1_
,
arg
.
c_grid_desc_m_n_
);
return
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_ak0_m_ak1_
,
arg
.
b_grid_desc_bk0_n_bk1_
,
arg
.
c_grid_desc_m_n_
,
arg
.
block_2_ctile_map_
);
}
// polymorphic
...
...
@@ -854,8 +821,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
static
auto
MakeArgument
(
const
ADataType
*
p_a
,
const
BDataType
*
p_b
,
CDataType
*
p_c
,
DDataType
*
p_d0
,
DDataType
*
p_d1
,
DPtrsGlobal
p_dxs
,
index_t
MRaw
,
index_t
NRaw
,
index_t
KRaw
,
...
...
@@ -865,15 +831,14 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
,
D
0ReduceOperation
d0_reduce
_op
,
D
1ReduceOperation
d1_reduce
_op
,
D
xsInElementwiseOperation
dxs_in_element
_op
,
D
xsAccElementwiseOperation
dxs_out_element
_op
,
index_t
BatchCount
)
{
return
Argument
{
p_a
,
p_b
,
p_c
,
p_d0
,
p_d1
,
p_dxs
,
MRaw
,
NRaw
,
KRaw
,
...
...
@@ -883,8 +848,8 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
a_element_op
,
b_element_op
,
c_element_op
,
d
0_reduce
_op
,
d
1_reduce
_op
,
d
xs_in_element
_op
,
d
xs_out_element
_op
,
BatchCount
};
}
...
...
@@ -894,8 +859,7 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_a
,
const
void
*
p_b
,
void
*
p_c
,
void
*
p_d0
,
void
*
p_d1
,
DPtrsGlobal
p_dxs
,
index_t
MRaw
,
index_t
NRaw
,
index_t
KRaw
,
...
...
@@ -905,15 +869,14 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
,
D
0ReduceOperation
d0_reduce
_op
,
D
1ReduceOperation
d1_reduce
_op
,
D
xsInElementwiseOperation
dxs_in_element
_op
,
D
xsAccElementwiseOperation
dxs_out_element
_op
,
index_t
BatchCount
)
override
{
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
ADataType
*>
(
p_a
),
static_cast
<
const
BDataType
*>
(
p_b
),
static_cast
<
CDataType
*>
(
p_c
),
static_cast
<
DDataType
*>
(
p_d0
),
static_cast
<
DDataType
*>
(
p_d1
),
p_dxs
,
MRaw
,
NRaw
,
KRaw
,
...
...
@@ -923,8 +886,8 @@ struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwi
a_element_op
,
b_element_op
,
c_element_op
,
d
0_reduce
_op
,
d
1_reduce
_op
,
d
xs_in_element
_op
,
d
xs_out_element
_op
,
BatchCount
);
}
...
...
include/ck/tensor_operation/gpu/device/device_batched_gemm_xdl.hpp
View file @
a3b4c5cb
...
...
@@ -107,7 +107,7 @@ __global__ void
ignore
=
a_element_op
;
ignore
=
b_element_op
;
ignore
=
c_element_op
;
ignore
=
compute_
base_ptr
_of_batch
_
;
ignore
=
compute_
ptr_offset
_of_batch
;
ignore
=
block_2_ctile_map
;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
}
...
...
@@ -243,44 +243,6 @@ struct DeviceBatchedGemmXdl
using
BGridDesc_K0_N_K1
=
decltype
(
MakeBGridDescriptor_K0_N_K1
(
1
,
1
,
1
));
using
CGridDesc_M_N
=
decltype
(
MakeCGridDescriptor_M_N
(
1
,
1
,
1
));
static
constexpr
auto
MakeBlock2CTileMap
(
index_t
batch_count
,
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
index_t
M01
,
index_t
N01
)
{
const
auto
M
=
c_grid_desc_m_n
.
GetLength
(
I0
);
const
auto
N
=
c_grid_desc_m_n
.
GetLength
(
I1
);
constexpr
auto
M1
=
Number
<
MPerBlock
>
{};
constexpr
auto
N1
=
Number
<
NPerBlock
>
{};
const
auto
M0
=
M
/
M1
;
const
auto
N0
=
N
/
N1
;
const
auto
M00
=
M0
/
M01
;
const
auto
N00
=
N0
/
N01
;
const
auto
g_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_insert_transform
(
batch_count
),
make_unmerge_transform
(
make_tuple
(
M00
,
M01
)),
make_unmerge_transform
(
make_tuple
(
N00
,
N01
))),
make_tuple
(
Sequence
<>
{},
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
3
>
{},
Sequence
<
2
,
4
>
{}));
const
auto
globalblockid_to_m00_m01_n00_n01_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
batch_count
,
M00
,
N00
,
M01
,
N01
))),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
,
4
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
const
auto
globalblockid_to_m0_n0_block_cluster_adaptor
=
chain_tensor_adaptors
(
g_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor
,
globalblockid_to_m00_m01_n00_n01_block_cluster_adaptor
);
return
globalblockid_to_m0_n0_block_cluster_adaptor
;
}
struct
ComputePtrOffsetOfStridedBatch
{
ComputePtrOffsetOfStridedBatch
(
index_t
BatchStrideA
,
...
...
@@ -354,7 +316,7 @@ struct DeviceBatchedGemmXdl
using
CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2
=
decltype
(
GridwiseGemm
::
MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2
(
CGridDesc_M_N
{}));
using
Block2CTileMap
=
decltype
(
MakeBlock2CTileMap
(
1
,
CGridDesc_M_N
{},
1
,
1
))
;
using
Block2CTileMap
=
typename
GridwiseGemm
::
DefaultBlock2CTileMap
;
// Argument
struct
Argument
:
public
BaseArgument
...
...
@@ -384,23 +346,25 @@ struct DeviceBatchedGemmXdl
DeviceBatchedGemmXdl
::
MakeBGridDescriptor_K0_N_K1
(
K
,
N
,
StrideB
)},
c_grid_desc_m_n_
{
DeviceBatchedGemmXdl
::
MakeCGridDescriptor_M_N
(
M
,
N
,
StrideC
)},
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_
{},
compute_ptr_offset_of_batch_
{
a_grid_desc_k0_m_k1_
.
GetElementSpaceSize
(),
b_grid_desc_k0_n_k1_
.
GetElementSpaceSize
(),
c_grid_desc_m_n_
.
GetElementSpaceSize
()},
block_2_ctile_map_
{},
compute_ptr_offset_of_batch_
{
type_convert
<
index_t
>
(
a_grid_desc_k0_m_k1_
.
GetElementSpaceSize
()),
type_convert
<
index_t
>
(
b_grid_desc_k0_n_k1_
.
GetElementSpaceSize
()),
type_convert
<
index_t
>
(
c_grid_desc_m_n_
.
GetElementSpaceSize
())},
block_2_ctile_map_
{
GridwiseGemm
::
MakeDefaultBlock2CTileMap
(
c_grid_desc_m_n_
,
M01
,
N01
)},
M01_
{
M01
},
N01_
{
N01
},
a_element_op_
{
a_element_op
},
b_element_op_
{
b_element_op
},
c_element_op_
{
c_element_op
}
{
if
(
GridwiseGemm
::
CheckValidity
(
a_grid_desc_k0_m_k1_
,
b_grid_desc_k0_n_k1_
,
c_grid_desc_m_n_
,
M01_
,
N01_
))
if
(
GridwiseGemm
::
CheckValidity
(
a_grid_desc_k0_m_k1_
,
b_grid_desc_k0_n_k1_
,
c_grid_desc_m_n_
,
block_2_ctile_map_
))
{
c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_
=
GridwiseGemm
::
MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2
(
c_grid_desc_m_n_
);
block_2_ctile_map_
=
MakeBlock2CTileMap
(
BatchCount
,
c_grid_desc_m_n_
,
M01
,
N01
);
}
}
...
...
@@ -427,7 +391,7 @@ struct DeviceBatchedGemmXdl
{
using
Argument
=
DeviceBatchedGemmXdl
::
Argument
;
float
Run
(
const
Argument
&
arg
,
int
nrepeat
=
1
)
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{}
)
{
{
std
::
cout
<<
"arg.a_grid_desc_k0_m_k1_{"
<<
arg
.
a_grid_desc_k0_m_k1_
.
GetLength
(
I0
)
...
...
@@ -445,23 +409,21 @@ struct DeviceBatchedGemmXdl
if
(
!
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_k0_m_k1_
,
arg
.
b_grid_desc_k0_n_k1_
,
arg
.
c_grid_desc_m_n_
,
arg
.
M01_
,
arg
.
N01_
))
arg
.
block_2_ctile_map_
))
{
throw
std
::
runtime_error
(
"wrong! GridwiseBatchedGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting"
);
}
const
index_t
grid_size
=
GridwiseGemm
::
CalculateGridSize
(
arg
.
c_grid_desc_m_n_
)
*
arg
.
BatchCount_
;
const
auto
K0
=
arg
.
a_grid_desc_k0_m_k1_
.
GetLength
(
I0
);
arg
.
block_2_ctile_map_
.
CalculateGridSize
(
arg
.
c_grid_desc_m_n_
)
*
arg
.
BatchCount_
;
const
bool
has_main_k0_block_loop
=
GridwiseGemm
::
CalculateHasMainK0BlockLoop
(
K0
);
const
auto
K
=
arg
.
a_grid_desc_k0_m_k1_
.
GetLength
(
I0
)
*
arg
.
a_grid_desc_k0_m_k1_
.
GetLength
(
I2
);
float
ave_time
=
0
;
if
(
has_main_k0_b
lock
_l
oop
)
if
(
GridwiseGemm
::
CalculateHasMainKB
lock
L
oop
(
K
)
)
{
const
auto
kernel
=
kernel_batched_gemm_xdlops_v2r3
<
GridwiseGemm
,
...
...
@@ -477,8 +439,8 @@ struct DeviceBatchedGemmXdl
remove_reference_t
<
Block2CTileMap
>
,
true
>
;
ave_time
=
launch_and_time_kernel
(
kernel
,
nrepeat
,
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
...
...
@@ -511,8 +473,8 @@ struct DeviceBatchedGemmXdl
remove_reference_t
<
Block2CTileMap
>
,
false
>
;
ave_time
=
launch_and_time_kernel
(
kernel
,
nrepeat
,
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
...
...
@@ -534,9 +496,10 @@ struct DeviceBatchedGemmXdl
}
// polymorphic
float
Run
(
const
BaseArgument
*
p_arg
,
int
nrepeat
=
1
)
override
float
Run
(
const
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
nrepeat
);
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
}
};
...
...
@@ -551,8 +514,7 @@ struct DeviceBatchedGemmXdl
return
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_k0_m_k1_
,
arg
.
b_grid_desc_k0_n_k1_
,
arg
.
c_grid_desc_m_n_
,
arg
.
M01_
,
arg
.
N01_
);
arg
.
block_2_ctile_map_
);
}
// polymorphic
...
...
include/ck/tensor_operation/gpu/device/device_binary_elementwise.hpp
0 → 100644
View file @
a3b4c5cb
#pragma once
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <vector>
#include "device.hpp"
#include "device_base.hpp"
#include "gridwise_binary_elementwise_1d.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
// Device-level operator that launches kernel_binary_elementwise_1d on three
// tensors A, B (inputs) and C (output), each described by NDim lengths and
// per-tensor strides.
//
// Template parameters:
//   ADataType / BDataType / CDataType - element types of the three buffers.
//   ComputeDataType                   - forwarded to the gridwise kernel.
//   ElementwiseFunctor                - functor object forwarded to the kernel.
//   NDim                              - number of tensor dimensions.
//   MPerThread                        - used to size the padding step and to
//                                       validate the innermost length.
//   A/B/CScalarPerVector              - vector widths validated in
//                                       IsSupportedArgument().
template <typename ADataType,
          typename BDataType,
          typename CDataType,
          typename ComputeDataType,
          typename ElementwiseFunctor,
          index_t NDim,
          index_t MPerThread,
          index_t AScalarPerVector,
          index_t BScalarPerVector,
          index_t CScalarPerVector>
struct DeviceBinaryElementwise : public BaseOperator
{
    static constexpr auto I0 = Number<0>{};

    // Right-pads a 1-D descriptor so that its length becomes
    // integer_least_multiple(M, gridSize * blockSize * MPerThread).
    template <typename Desc_M>
    static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
    {
        const auto M            = desc_m.GetLength(I0);
        const index_t loop_step = gridSize * blockSize * MPerThread;
        const auto pad          = math::integer_least_multiple(M, loop_step) - M;
        const auto desc_m_pad =
            transform_tensor_descriptor(desc_m,
                                        make_tuple(make_right_pad_transform(M, pad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));
        return desc_m_pad;
    }

    // Builds the padded 1-D descriptor for one tensor.
    // The first NDim entries of `lengths` and `strides` are read; callers must
    // therefore pass vectors holding at least NDim elements.
    static auto MakeDescriptor_M(const std::vector<index_t>& lengths,
                                 const std::vector<index_t>& strides,
                                 index_t gridSize,
                                 index_t blockSize)
    {
        auto tupleOfShape  = generate_tuple([&](auto I) { return lengths[I]; }, Number<NDim>{});
        auto tupleOfStride = generate_tuple([&](auto I) { return strides[I]; }, Number<NDim>{});

        // nd desc - [s0, s1, s2, ...]
        const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);

        // merge nd to 1d desc - [s0 * s1 * ...]
        if constexpr(NDim > 1)
        {
            const auto desc_m = transform_tensor_descriptor(
                desc,
                make_tuple(make_merge_transform(tupleOfShape)),
                make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<NDim>{})),
                make_tuple(Sequence<0>{}));

            return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
        }
        else
            return PadDescriptor_M_1d(desc, gridSize, blockSize);
    }

    // The descriptor types only depend on NDim, so dummy arguments are enough
    // inside the unevaluated decltype operand.
    using AGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
    using BGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
    using CGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));

    using GridwiseBinEltwise = GridwiseBinaryElementwise_1D<ADataType,
                                                            BDataType,
                                                            CDataType,
                                                            ComputeDataType,
                                                            AGridDesc_M,
                                                            BGridDesc_M,
                                                            CGridDesc_M,
                                                            ElementwiseFunctor,
                                                            MPerThread,
                                                            AScalarPerVector,
                                                            BScalarPerVector,
                                                            CScalarPerVector>;

    // Bundles the raw pointers, the tensor geometry, the functor and the
    // launch configuration for one invocation.
    struct Argument : public BaseArgument
    {
        Argument(const ADataType* p_a,
                 const BDataType* p_b,
                 CDataType* p_c,
                 const std::vector<index_t>& lengths,
                 const std::vector<index_t>& a_strides,
                 const std::vector<index_t>& b_strides,
                 const std::vector<index_t>& c_strides,
                 ElementwiseFunctor functor)
            : p_a_(p_a),
              p_b_(p_b),
              p_c_(p_c),
              lengths_(lengths),
              a_strides_(a_strides),
              b_strides_(b_strides),
              c_strides_(c_strides),
              functor_(functor),
              blockSize_(256),
              gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
        {
            a_grid_desc_m_ = MakeDescriptor_M(lengths, a_strides, gridSize_, blockSize_);
            b_grid_desc_m_ = MakeDescriptor_M(lengths, b_strides, gridSize_, blockSize_);
            c_grid_desc_m_ = MakeDescriptor_M(lengths, c_strides, gridSize_, blockSize_);
        }

        const ADataType* p_a_;
        const BDataType* p_b_;
        CDataType* p_c_;
        // Stored as index_t for consistency with the stride vectors and with
        // the constructor parameter it is copied from.
        std::vector<index_t> lengths_;
        AGridDesc_M a_grid_desc_m_;
        BGridDesc_M b_grid_desc_m_;
        CGridDesc_M c_grid_desc_m_;
        std::vector<index_t> a_strides_;
        std::vector<index_t> b_strides_;
        std::vector<index_t> c_strides_;
        ElementwiseFunctor functor_;
        index_t blockSize_;
        index_t gridSize_;
    };

    struct Invoker : public BaseInvoker
    {
        // Launches the kernel with the grid/block size stored in `arg` and
        // returns whatever launch_and_time_kernel reports.
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto kernel = kernel_binary_elementwise_1d<GridwiseBinEltwise,
                                                             ADataType,
                                                             BDataType,
                                                             CDataType,
                                                             AGridDesc_M,
                                                             BGridDesc_M,
                                                             CGridDesc_M,
                                                             ElementwiseFunctor>;

            float elapsed_time = launch_and_time_kernel(stream_config,
                                                        kernel,
                                                        dim3(arg.gridSize_),
                                                        dim3(arg.blockSize_),
                                                        0,
                                                        arg.p_a_,
                                                        arg.p_b_,
                                                        arg.p_c_,
                                                        arg.a_grid_desc_m_,
                                                        arg.b_grid_desc_m_,
                                                        arg.c_grid_desc_m_,
                                                        arg.functor_);
            return elapsed_time;
        }

        // polymorphic
        // Throws std::runtime_error when `p_arg` is null or is not an Argument
        // of this operator, instead of dereferencing a null pointer.
        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            const auto* p_typed_arg = dynamic_cast<const Argument*>(p_arg);

            if(p_typed_arg == nullptr)
            {
                throw std::runtime_error(
                    "wrong! DeviceBinaryElementwise::Invoker::Run received an invalid argument");
            }

            return Run(*p_typed_arg, stream_config);
        }
    };

    // Returns true only when `p_arg` is an Argument of this operator whose
    // geometry is compatible with NDim, MPerThread and the vector widths.
    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        if(pArg == nullptr)
            return false;

        // All four vectors must describe exactly NDim dimensions; this also
        // guarantees the .back() calls below see the innermost dimension.
        constexpr auto num_dim = static_cast<std::size_t>(NDim);

        if(pArg->lengths_.size() != num_dim || pArg->a_strides_.size() != num_dim ||
           pArg->b_strides_.size() != num_dim || pArg->c_strides_.size() != num_dim)
            return false;

        // .back() on an empty vector is undefined behavior.
        if(pArg->lengths_.empty())
            return false;

        if(pArg->lengths_.back() % MPerThread != 0)
            return false;

        auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) {
            // Reject non-positive widths up front so the modulo below can
            // never divide by zero.
            if(scalarPerVector <= 0)
                return false;

            bool ret = true;

            if(!isLastDimensionCoalesced)
                ret = scalarPerVector == 1;
            else
                ret = MPerThread % scalarPerVector == 0;

            return ret;
        };

        if(!IsScalarPerVectorValid(pArg->a_strides_.back() == 1, AScalarPerVector))
            return false;

        if(!IsScalarPerVectorValid(pArg->b_strides_.back() == 1, BScalarPerVector))
            return false;

        if(!IsScalarPerVectorValid(pArg->c_strides_.back() == 1, CScalarPerVector))
            return false;

        return true;
    }

    // Type-erased factory: casts the void pointers to the element types of
    // this operator and forwards everything to the Argument constructor.
    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
                                                      const void* p_b,
                                                      void* p_c,
                                                      std::vector<index_t> lengths,
                                                      std::vector<index_t> a_strides,
                                                      std::vector<index_t> b_strides,
                                                      std::vector<index_t> c_strides,
                                                      ElementwiseFunctor functor)
    {
        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                          static_cast<const BDataType*>(p_b),
                                          static_cast<CDataType*>(p_c),
                                          lengths,
                                          a_strides,
                                          b_strides,
                                          c_strides,
                                          functor);
    }

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); }

    // Human-readable identifier containing the MPerThread setting.
    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceBinaryElementwise"
            << "<"
            << "MPerThread = "
            << MPerThread
            << ">";
        // clang-format on

        return str.str();
    }
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_cgemm.hpp
0 → 100644
View file @
a3b4c5cb
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2022 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#pragma once
#include "device_base.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
// Abstract device-operator interface for a GEMM whose A, B and C operands are
// each supplied as two separate buffers (`*_real` / `*_imag`) plus a caller
// provided workspace buffer.
// NOTE(review): presumably this models a complex-valued GEMM - confirm against
// the concrete implementations.
template <typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
struct DeviceCGemm : public BaseOperator
{
    // Creates the type-erased argument object for one problem instance.
    // `KBatch` defaults to 1.
    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a_real,
                                                              const void* p_a_imag,
                                                              const void* p_b_real,
                                                              const void* p_b_imag,
                                                              void* p_c_real,
                                                              void* p_c_imag,
                                                              void* p_workspace,
                                                              ck::index_t M,
                                                              ck::index_t N,
                                                              ck::index_t K,
                                                              ck::index_t StrideA,
                                                              ck::index_t StrideB,
                                                              ck::index_t StrideC,
                                                              AElementwiseOperation a_element_op,
                                                              BElementwiseOperation b_element_op,
                                                              CElementwiseOperation c_element_op,
                                                              ck::index_t KBatch = 1) = 0;

    // Creates the invoker that runs arguments produced by MakeArgumentPointer().
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;

    // Reports the workspace size for the given problem dimensions and strides.
    // NOTE(review): the unit (bytes vs. elements) is not visible here - confirm
    // against the concrete implementations before allocating `p_workspace`.
    virtual std::size_t GetWorkspaceSize(index_t MRaw,
                                         index_t NRaw,
                                         index_t KRaw,
                                         index_t StrideA,
                                         index_t StrideB,
                                         index_t StrideC) = 0;
};
// Owning pointer to a DeviceCGemm instance parameterised by the same three
// element-wise operation types.
template <typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
using DeviceCGemmPtr = std::unique_ptr<
    DeviceCGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
Prev
1
2
3
4
5
6
7
8
…
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment