Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
0e877b84
Commit
0e877b84
authored
Apr 10, 2022
by
Chao Liu
Browse files
adding thread group
parent
3f4af14c
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
14 additions
and
19 deletions
+14
-19
include/ck/config.hpp
include/ck/config.hpp
+1
-4
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
...e/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+6
-9
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v4r1.hpp
...ration/gpu/block/blockwise_tensor_slice_transfer_v4r1.hpp
+1
-4
include/ck/utility/common_header.hpp
include/ck/utility/common_header.hpp
+1
-0
include/ck/utility/get_id.hpp
include/ck/utility/get_id.hpp
+5
-2
No files found.
include/ck/config.hpp
View file @
0e877b84
...
...
@@ -26,17 +26,14 @@
#endif
#endif
// buffer resourse
, wave size
// buffer resourse
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_BUFFER_RESOURCE_3RD_DWORD -1
#define CK_GPU_WAVE_SIZE -1
#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#define CK_GPU_WAVE_SIZE 64
#elif defined(__gfx1030__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#define CK_GPU_WAVE_SIZE 32
#endif
// FMA instruction
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
View file @
0e877b84
#ifndef CK_BLOCKWISE_GEMM_XDLOPS_HPP
#define CK_BLOCKWISE_GEMM_XDLOPS_HPP
#pragma once
#include "common_header.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "xdlops_gemm.hpp"
...
...
@@ -8,7 +6,7 @@
namespace
ck
{
template
<
index_t
BlockSize
,
template
<
typename
ThreadGroup
,
typename
FloatAB
,
typename
FloatAcc
,
typename
AK0MK1BlockDesc
,
...
...
@@ -25,7 +23,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
index_t
WaveSize
=
64
;
static
constexpr
index_t
WaveSize
=
get_warp_size
()
;
static
constexpr
index_t
MPerBlock
=
AK0MK1BlockDesc
{}.
GetLength
(
I1
);
static
constexpr
index_t
NPerBlock
=
BK0NK1BlockDesc
{}.
GetLength
(
I1
);
...
...
@@ -53,7 +51,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
__device__
static
auto
GetWaveIdx
()
{
const
index_t
thread_id
=
get_thread_local_1d_i
d
();
const
index_t
thread_id
=
ThreadGroup
::
GetThreadI
d
();
constexpr
auto
threadid_to_wave_idx_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
MWaves
,
NWaves
,
WaveSize
))),
...
...
@@ -120,8 +118,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
BK0NK1BlockDesc
::
IsKnownAtCompileTime
(),
"wrong! Desc should be known at compile-time"
);
static_assert
(
BlockSize
==
MWaves
*
NWaves
*
WaveSize
,
"
BlockSize
!= MWaves * NWaves * WaveSize
\n
"
);
static_assert
(
ThreadGroup
::
GetNumOfThread
()
==
MWaves
*
NWaves
*
WaveSize
,
"
ThreadGroup::GetNumOfThread()
!= MWaves * NWaves * WaveSize
\n
"
);
static_assert
(
MPerBlock
%
(
MPerXDL
*
MRepeat
)
==
0
&&
NPerBlock
%
(
NPerXDL
*
NRepeat
)
==
0
,
"wrong!"
);
...
...
@@ -337,4 +335,3 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
};
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v4r1.hpp
View file @
0e877b84
#ifndef CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V4R1_HPP
#define CK_BLOCKWISE_TENSOR_SLICE_TRANSFER_V4R1_HPP
#pragma once
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
...
...
@@ -169,4 +167,3 @@ struct BlockwiseTensorSliceTransfer_v4r1
};
}
// namespace ck
#endif
include/ck/utility/common_header.hpp
View file @
0e877b84
...
...
@@ -27,6 +27,7 @@
#include "transpose_vectors.hpp"
#include "inner_product.hpp"
#include "element_wise_operation.hpp"
#include "thread_group.hpp"
#include "debug.hpp"
#include "amd_buffer_addressing.hpp"
...
...
include/ck/utility/get_id.hpp
View file @
0e877b84
...
...
@@ -3,11 +3,14 @@
namespace
ck
{
__device__
constexpr
index_t
get_wave_size
()
{
return
CK_GPU_WAVE_SIZE
;
}
__host__
__device__
constexpr
index_t
get_warp_size
()
{
// warpSize is defined by HIP
return
warpSize
;
}
__device__
index_t
get_thread_local_1d_id
()
{
return
threadIdx
.
x
;
}
__device__
index_t
get_wa
ve
_local_1d_id
()
{
return
threadIdx
.
x
/
get_wa
ve
_size
();
}
__device__
index_t
get_wa
rp
_local_1d_id
()
{
return
threadIdx
.
x
/
get_wa
rp
_size
();
}
__device__
index_t
get_block_1d_id
()
{
return
blockIdx
.
x
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment