Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
55d73548
Commit
55d73548
authored
Nov 01, 2023
by
Bartlomiej Wroblewski
Browse files
Add comments
parent
91d13ef4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
32 additions
and
2 deletions
+32
-2
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp
.../block/thread_group_tensor_slice_transfer_direct_load.hpp
+30
-1
include/ck/utility/amd_buffer_addressing.hpp
include/ck/utility/amd_buffer_addressing.hpp
+1
-1
include/ck/utility/dynamic_buffer.hpp
include/ck/utility/dynamic_buffer.hpp
+1
-0
No files found.
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp
View file @
55d73548
...
@@ -11,7 +11,36 @@
...
@@ -11,7 +11,36 @@
namespace
ck
{
namespace
ck
{
// TODO: Write the description.
/**
* Transfer that uses direct load instructions to copy data from global to LDS memory.
*
* Traditional loads first copy data from global to registers, and then from registers to LDS.
* Direct loads do not need an intermediate step, data is copied directly from global to LDS,
* without the use of additional registers.
*
* However, the instruction has limitations:
* - each thread must copy exactly a single DWORD - 4 bytes;
* - threads within a single wavefront must write consecutive DWORDS into LDS,
* (data in global do not need to be contiguous, each thread might have its own offset).
*
* To make sure that all the transfers finished, the `waitcnt` instruction must be used with
* `vmcnt` instead of `lgkmcnt`.
*
* Limitations of the transfer class:
* - `SrcData` must be the same as `DstData` - no possibility to convert the data type in flight;
* - `DstVectorDim` must be the last dimension;
* - `SrcVectorDim` must be the last dimension if `ScalarPerVector` is greater than 1;
* - `ScalarPerVector` times the number of bytes of `DstData` must be equal to a single DWORD = 4B
* (for examlpe if `DstData` is fp32, then `ScalarPerVector` must be 1; if `DstData` is fp16,
* `ScalarPerVector` must be 2);
* - if `ScalarPerVector` is greater than 1, the contiguous dimension in src and dst must be
* the same dimension;
* - threads in a wavefront must write contiguous data to LDS (when wavefront size is 64,
* they must write 64 contiguous DWORDs) - `ThreadClusterLengths` must be prepared in such a way
* to guarantee that.
*
* For now, only single LDS buffer is supported.
*/
template
<
typename
ThreadGroup
,
template
<
typename
ThreadGroup
,
typename
BlockSliceLengths
,
typename
BlockSliceLengths
,
typename
ThreadClusterLengths
,
typename
ThreadClusterLengths
,
...
...
include/ck/utility/amd_buffer_addressing.hpp
View file @
55d73548
...
@@ -962,7 +962,7 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
...
@@ -962,7 +962,7 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
const
bool
is_valid
,
const
bool
is_valid
,
const
index_t
src_element_space_size
)
const
index_t
src_element_space_size
)
{
{
// Direct loads require that each thread
writes
a single DWORD.
// Direct loads require that each thread
reads and writes exactly
a single DWORD.
constexpr
auto
dword_bytes
=
4
;
constexpr
auto
dword_bytes
=
4
;
constexpr
auto
bytes_per_thread
=
sizeof
(
T
)
*
NumElemsPerThread
;
constexpr
auto
bytes_per_thread
=
sizeof
(
T
)
*
NumElemsPerThread
;
static_assert
(
bytes_per_thread
==
dword_bytes
);
static_assert
(
bytes_per_thread
==
dword_bytes
);
...
...
include/ck/utility/dynamic_buffer.hpp
View file @
55d73548
...
@@ -177,6 +177,7 @@ struct DynamicBuffer
...
@@ -177,6 +177,7 @@ struct DynamicBuffer
__host__
__device__
void
__host__
__device__
void
CopyTo
(
DstBuffer
&
dst_buf
,
index_t
src_offset
,
index_t
dst_offset
,
bool
is_valid_element
)
const
CopyTo
(
DstBuffer
&
dst_buf
,
index_t
src_offset
,
index_t
dst_offset
,
bool
is_valid_element
)
const
{
{
// Copy data from global to LDS memory using direct loads.
static_assert
(
GetAddressSpace
()
==
AddressSpaceEnum
::
Global
,
static_assert
(
GetAddressSpace
()
==
AddressSpaceEnum
::
Global
,
"Source data must come from a global memory buffer."
);
"Source data must come from a global memory buffer."
);
static_assert
(
DstBuffer
::
GetAddressSpace
()
==
AddressSpaceEnum
::
Lds
,
static_assert
(
DstBuffer
::
GetAddressSpace
()
==
AddressSpaceEnum
::
Lds
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment