Commit 55d73548 authored by Bartlomiej Wroblewski's avatar Bartlomiej Wroblewski
Browse files

Add comments

parent 91d13ef4
...@@ -11,7 +11,36 @@ ...@@ -11,7 +11,36 @@
namespace ck { namespace ck {
// TODO: Write the description. /**
* Transfer that uses direct load instructions to copy data from global to LDS memory.
*
* Traditional loads first copy data from global to registers, and then from registers to LDS.
* Direct loads do not need an intermediate step, data is copied directly from global to LDS,
* without the use of additional registers.
*
* However, the instruction has limitations:
* - each thread must copy exactly a single DWORD - 4 bytes;
* - threads within a single wavefront must write consecutive DWORDS into LDS,
* (data in global do not need to be contiguous, each thread might have its own offset).
*
* To make sure that all the transfers finished, the `waitcnt` instruction must be used with
* `vmcnt` instead of `lgkmcnt`.
*
* Limitations of the transfer class:
* - `SrcData` must be the same as `DstData` - no possibility to convert the data type in flight;
* - `DstVectorDim` must be the last dimension;
* - `SrcVectorDim` must be the last dimension if `ScalarPerVector` is greater than 1;
* - `ScalarPerVector` times the number of bytes of `DstData` must be equal to a single DWORD = 4B
* (for examlpe if `DstData` is fp32, then `ScalarPerVector` must be 1; if `DstData` is fp16,
* `ScalarPerVector` must be 2);
* - if `ScalarPerVector` is greater than 1, the contiguous dimension in src and dst must be
* the same dimension;
* - threads in a wavefront must write contiguous data to LDS (when wavefront size is 64,
* they must write 64 contiguous DWORDs) - `ThreadClusterLengths` must be prepared in such a way
* to guarantee that.
*
* For now, only single LDS buffer is supported.
*/
template <typename ThreadGroup, template <typename ThreadGroup,
typename BlockSliceLengths, typename BlockSliceLengths,
typename ThreadClusterLengths, typename ThreadClusterLengths,
......
...@@ -962,7 +962,7 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr, ...@@ -962,7 +962,7 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
const bool is_valid, const bool is_valid,
const index_t src_element_space_size) const index_t src_element_space_size)
{ {
// Direct loads require that each thread writes a single DWORD. // Direct loads require that each thread reads and writes exactly a single DWORD.
constexpr auto dword_bytes = 4; constexpr auto dword_bytes = 4;
constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread; constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
static_assert(bytes_per_thread == dword_bytes); static_assert(bytes_per_thread == dword_bytes);
......
...@@ -177,6 +177,7 @@ struct DynamicBuffer ...@@ -177,6 +177,7 @@ struct DynamicBuffer
__host__ __device__ void __host__ __device__ void
CopyTo(DstBuffer& dst_buf, index_t src_offset, index_t dst_offset, bool is_valid_element) const CopyTo(DstBuffer& dst_buf, index_t src_offset, index_t dst_offset, bool is_valid_element) const
{ {
// Copy data from global to LDS memory using direct loads.
static_assert(GetAddressSpace() == AddressSpaceEnum::Global, static_assert(GetAddressSpace() == AddressSpaceEnum::Global,
"Source data must come from a global memory buffer."); "Source data must come from a global memory buffer.");
static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment