Commit 5af722b4 authored by Chao Liu

update buffer addressing

parent c68d9147
@@ -35,44 +35,17 @@ __llvm_amdgcn_buffer_load_f32x4(int32x4_t srsrc,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");
__device__ half_t __llvm_amdgcn_buffer_load_f16(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.f16");
__device__ half2_t __llvm_amdgcn_buffer_load_f16x2(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v2f16");
__device__ half4_t __llvm_amdgcn_buffer_load_f16x4(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v4f16");
__device__ ushort __llvm_amdgcn_buffer_load_bf16(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.bf16");
__device__ ushort2_t
__llvm_amdgcn_buffer_load_bf16x2(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v2bf16");
__device__ ushort4_t
__llvm_amdgcn_buffer_load_bf16x4(int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.load.v4bf16");
__device__ half_t
__llvm_amdgcn_raw_buffer_load_f16(int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");
__device__ ushort
__llvm_amdgcn_raw_buffer_load_bf16(int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.bf16");
__device__ void __llvm_amdgcn_buffer_store_f32(float vdata,
int32x4_t srsrc,
@@ -95,56 +68,33 @@ __device__ void __llvm_amdgcn_buffer_store_f32x4(float4_t vdata,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");
__device__ void __llvm_amdgcn_buffer_store_f16(half_t vdata,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.f16");
__device__ void __llvm_amdgcn_buffer_store_f16x2(half2_t vdata,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v2f16");
__device__ void __llvm_amdgcn_buffer_store_f16x4(half4_t vdata,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v4f16");
__device__ void __llvm_amdgcn_buffer_store_bf16(ushort vdata,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.bf16");
__device__ void
__llvm_amdgcn_buffer_store_bf16x2(ushort2_t vdata,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v2bf16");
__llvm_amdgcn_raw_buffer_store_f16(half_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");
__device__ void
__llvm_amdgcn_buffer_store_bf16x4(ushort4_t vdata,
int32x4_t srsrc,
index_t vindex,
index_t offset,
bool glc,
bool slc) __asm("llvm.amdgcn.buffer.store.v4bf16");
__llvm_amdgcn_raw_buffer_store_bf16(ushort vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.bf16");
#if CK_USE_AMD_BUFFER_ATOMIC_FADD
#if CK_HIP_VERSION_FLAT >= 3010020405
// starting with ROCm 3.10, the return type becomes float
__device__ float
#else
__device__ void
#endif
__llvm_amdgcn_buffer_atomic_add_f32(float vdata,
int32x4_t srsrc,
int32x4_t rsrc,
index_t vindex,
index_t offset,
bool slc) __asm("llvm.amdgcn.buffer.atomic.fadd.f32");
#endif
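// Hedged usage sketch (hypothetical helper, not part of this change): because the
// return type of the fadd intrinsic above depends on the ROCm version, a caller
// that wants the pre-add value must guard its use behind the same version macro.
#if CK_USE_AMD_BUFFER_ATOMIC_FADD
__device__ void example_buffer_atomic_add_f32(float vdata, int32x4_t rsrc, index_t byte_offset)
{
#if CK_HIP_VERSION_FLAT >= 3010020405
    // ROCm >= 3.10: the intrinsic returns the value memory held before the add
    float old_value = __llvm_amdgcn_buffer_atomic_add_f32(vdata, rsrc, 0, byte_offset, false);
    (void)old_value;
#else
    // older ROCm: the intrinsic returns void, so there is nothing to capture
    __llvm_amdgcn_buffer_atomic_add_f32(vdata, rsrc, 0, byte_offset, false);
#endif
}
#endif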
// buffer_load requires:
// 1) p_src_thread must be in global memory space, p_dst_thread must be vgpr
@@ -196,20 +146,11 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
#if 1 // debug
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset
: 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
#else
return src_thread_data_valid
? __llvm_amdgcn_buffer_load_f32(
@@ -235,18 +176,18 @@ __device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset
: 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
float2_t zeros(0);
return src_thread_data_valid
? __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false)
: zeros;
#endif
}
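// Why the 0x7fffffff shift above is branch-free (hedged explanation, added for
// clarity): the buffer resource descriptor encodes the buffer's size, and the
// hardware range-checks every buffer access against it. Adding 0x7fffffff to an
// invalid lane's byte offset pushes that lane's address far out of range, so its
// load returns 0 and a store would be dropped, without a branch in the generated
// code. The idiom in isolation (lane_valid/byte_offset are illustrative names):
//
//   uint32_t shift = lane_valid ? 0 : 0x7fffffff;
//   float r = __llvm_amdgcn_buffer_load_f32(rsrc, 0, shift + byte_offset, false, false);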
@@ -267,18 +208,18 @@ __device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset
: 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
float4_t zeros(0);
return src_thread_data_valid
? __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false)
: zeros;
#endif
}
@@ -297,25 +238,24 @@ __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
#if !CK_WORKAROUND_SWDEV_231101
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_f16(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset
: 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_f16(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// everything is passed to Voffset
return __llvm_amdgcn_raw_buffer_load_f16(
src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);
#else
return src_thread_data_valid ? p_src_wave[src_thread_data_offset] : 0;
#endif
half_t zero(0);
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// everything is passed to Voffset
return src_thread_data_valid ? __llvm_amdgcn_raw_buffer_load_f16(
src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0)
: zero;
#endif // CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
}
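// Shape of the raw-buffer call used above (hedged sketch): the raw intrinsic takes
// a per-lane byte offset in Voffset, a wave-uniform byte offset in Soffset, and a
// combined glc/slc flag word. Since this code cannot separate the two offsets,
// Soffset stays 0 and the whole per-lane byte offset travels in Voffset, e.g.
// (per_lane_byte_offset is an illustrative name):
//
//   half_t r = __llvm_amdgcn_raw_buffer_load_f16(rsrc, per_lane_byte_offset, 0 /*soffset*/, 0 /*glc_slc*/);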
template <>
@@ -335,21 +275,21 @@ __device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float dst_out_tmp =
__llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
return *reinterpret_cast<half2_t*>(&dst_out_tmp);
#else
half2_t zeros(0);
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? *reinterpret_cast<half2_t*>(&dst_out_tmp) : zeros;
#endif
}
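// Note (hedged): a 2 x half_t value is moved as one 32-bit word through the f32
// intrinsic and then reinterpreted, plausibly to sidestep the fp16/bf16 buffer
// intrinsic issue referenced by CK_WORKAROUND_SWDEV_231101. The punning pattern
// in isolation:
//
//   float w = __llvm_amdgcn_buffer_load_f32(rsrc, 0, byte_offset, false, false);
//   half2_t h = *reinterpret_cast<half2_t*>(&w);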
template <>
@@ -369,21 +309,21 @@ __device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float2_t dst_out_tmp =
__llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
return *reinterpret_cast<half4_t*>(&dst_out_tmp);
#else
half4_t zeros(0);
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? *reinterpret_cast<half4_t*>(&dst_out_tmp) : zeros;
#endif
}
template <>
@@ -403,21 +343,21 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float4_t dst_out_tmp =
__llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
return *reinterpret_cast<half8_t*>(&dst_out_tmp);
#else
half8_t zeros(0);
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? *reinterpret_cast<half8_t*>(&dst_out_tmp) : zeros;
#endif
}
template <>
@@ -435,25 +375,23 @@ __device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
// wavewise setting (32 bit)
src_wave_buffer_resource.config[3] = 0x00027000;
#if !CK_WORKAROUND_SWDEV_231101
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return __llvm_amdgcn_buffer_load_bf16(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset
: 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
return __llvm_amdgcn_buffer_load_bf16(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// everything is passed to Voffset
return __llvm_amdgcn_raw_buffer_load_bf16(
src_wave_buffer_resource.data, src_addr_shift + src_thread_addr_offset, 0, 0);
#else
return src_thread_data_valid ? p_src_wave[src_thread_data_offset] : 0;
ushort zero(0);
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// everything is passed to Voffset
return src_thread_data_valid ? __llvm_amdgcn_raw_buffer_load_bf16(
src_wave_buffer_resource.data, src_thread_addr_offset, 0, 0)
: zero;
#endif
}
@@ -474,21 +412,21 @@ __device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float dst_out_tmp =
__llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
return *reinterpret_cast<ushort2_t*>(&dst_out_tmp);
#else
ushort2_t zeros(0);
float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? *reinterpret_cast<ushort2_t*>(&dst_out_tmp) : zeros;
#endif
}
template <>
@@ -508,21 +446,21 @@ __device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float2_t dst_out_tmp =
__llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
return *reinterpret_cast<ushort4_t*>(&dst_out_tmp);
#else
ushort4_t zeros(0);
float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? *reinterpret_cast<ushort4_t*>(&dst_out_tmp) : zeros;
#endif
}
template <>
@@ -542,21 +480,21 @@ __device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float4_t dst_out_tmp =
__llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data,
0,
src_thread_data_valid ? src_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff;
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#endif
return *reinterpret_cast<ushort8_t*>(&dst_out_tmp);
#else
ushort8_t zeros(0);
float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
return src_thread_data_valid ? *reinterpret_cast<ushort8_t*>(&dst_out_tmp) : zeros;
#endif
}
template <>
@@ -577,15 +515,7 @@ __device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if 1 // debug
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_thread,
@@ -594,7 +524,6 @@ __device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
#else
if(dst_thread_data_valid)
{
@@ -622,14 +551,7 @@ __device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
@@ -638,6 +560,16 @@ __device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src_thread),
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
@@ -659,14 +591,7 @@ __device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
@@ -675,6 +600,16 @@ __device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src_thread),
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
@@ -694,31 +629,25 @@ __device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
#if !CK_WORKAROUND_SWDEV_231101
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f16(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f16(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// everything is passed to Voffset
__llvm_amdgcn_raw_buffer_store_f16(*p_src_thread,
dst_wave_buffer_resource.data,
dst_addr_shift + dst_thread_addr_offset,
0,
0);
#else
if(dst_thread_data_valid)
{
p_dst_wave[dst_thread_data_offset] = *p_src_thread;
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
// everything is passed to Voffset
__llvm_amdgcn_raw_buffer_store_f16(
*p_src_thread, dst_wave_buffer_resource.data, dst_thread_addr_offset, 0, 0);
}
#endif
}
@@ -743,14 +672,7 @@ __device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
@@ -759,6 +681,12 @@ __device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32(
*p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
}
#endif
}
@@ -782,14 +710,7 @@ __device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
@@ -798,6 +719,12 @@ __device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x2(
*p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
}
#endif
}
@@ -821,14 +748,7 @@ __device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
@@ -837,6 +757,12 @@ __device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x4(
*p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
}
#endif
}
@@ -856,31 +782,21 @@ __device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
#if !CK_WORKAROUND_SWDEV_231101
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_bf16(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_bf16(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#endif
__llvm_amdgcn_raw_buffer_store_bf16(*p_src_thread,
dst_wave_buffer_resource.data,
dst_addr_shift + dst_thread_addr_offset,
0,
0);
#else
if(dst_thread_data_valid)
{
p_dst_wave[dst_thread_data_offset] = *p_src_thread;
__llvm_amdgcn_raw_buffer_store_bf16(
*p_src_thread, dst_wave_buffer_resource.data, dst_thread_addr_offset, 0, 0);
}
#endif
}
@@ -905,14 +821,7 @@ __device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(*p_src_tmp,
@@ -921,6 +830,12 @@ __device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32(
*p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
}
#endif
}
@@ -944,14 +859,7 @@ __device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
@@ -960,6 +868,12 @@ __device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x2(
*p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
}
#endif
}
@@ -983,14 +897,7 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
@@ -999,9 +906,16 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x4(
*p_src_tmp, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false);
}
#endif
}
#if CK_USE_AMD_BUFFER_ATOMIC_FADD
template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
float* p_dst_wave,
@@ -1020,13 +934,7 @@ __device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
false);
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
@@ -1034,6 +942,12 @@ __device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
0,
dst_addr_shift + dst_thread_addr_offset,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_atomic_add_f32(
*p_src_thread, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false);
}
#endif
}
@@ -1055,19 +969,9 @@ __device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
for(index_t i = 0; i < 2; ++i)
{
__llvm_amdgcn_buffer_atomic_add_f32(
p_src_thread[i],
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? (dst_thread_addr_offset + i * sizeof(float)) : 0xffffffff,
false);
}
#else
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
for(index_t i = 0; i < 2; ++i)
{
__llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
@@ -1077,6 +981,18 @@ __device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
i * sizeof(float),
false);
}
#else
if(dst_thread_data_valid)
{
for(index_t i = 0; i < 2; ++i)
{
__llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset + i * sizeof(float),
false);
}
}
#endif
}
@@ -1098,17 +1014,7 @@ __device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
for(index_t i = 0; i < 4; ++i)
{
__llvm_amdgcn_buffer_atomic_add_f32(
p_src_thread[i],
dst_wave_buffer_resource.data,
0,
dst_thread_data_valid ? (dst_thread_addr_offset + i * sizeof(float)) : 0xffffffff,
false);
}
#else
#if CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
for(index_t i = 0; i < 4; ++i)
@@ -1120,8 +1026,21 @@ __device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
i * sizeof(float),
false);
}
#else
if(dst_thread_data_valid)
{
for(index_t i = 0; i < 4; ++i)
{
__llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset + i * sizeof(float),
false);
}
}
#endif
}
#endif // CK_USE_AMD_BUFFER_ATOMIC_FADD
} // namespace ck
#endif
@@ -29,8 +29,8 @@
#endif
// only gfx908 supports native floating point atomic add
#ifndef CK_USE_AMD_BUFFER_ATOMIC_ADD
#define CK_USE_AMD_BUFFER_ATOMIC_ADD 0
#ifndef CK_USE_AMD_BUFFER_ATOMIC_FADD
#define CK_USE_AMD_BUFFER_ATOMIC_FADD 0
#endif
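// Hedged usage note: this default leaves fp32 buffer atomics disabled; on gfx908,
// which supports them natively, the switch is meant to be flipped at compile time,
// e.g. (flag spelling from ROCm 3.x-era hipcc, shown only as an illustration):
//
//   hipcc --amdgpu-target=gfx908 -DCK_USE_AMD_BUFFER_ATOMIC_FADD=1 ...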
// AMD XDLOPS
@@ -52,8 +52,8 @@
#endif
// experimental implementation
#ifndef CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
#define CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK 1
#ifndef CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK
#define CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK 1
#endif
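// Hedged summary of how the two experimental switches interact in the buffer
// addressing code above: with CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
// disabled, invalid lanes pass 0xffffffff as the buffer offset; with it enabled,
// CK_EXPERIMENTAL_USE_BUFFER_ADDRESS_OOB_CHECK chooses between the branch-free
// 0x7fffffff address shift (1) and an explicit validity check (0).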
#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
@@ -73,10 +73,6 @@
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif
// workaround for buffer load/store fp16/bfp16 intrinsic bug
#ifndef CK_WORKAROUND_SWDEV_231101
#define CK_WORKAROUND_SWDEV_231101 1
#endif
namespace ck {
......
@@ -187,7 +187,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 4;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 4;
#elif 0
#elif 1
// cdata = 64, BlockSize = 256, 128x128x16
constexpr index_t BlockSize = 256;
......
@@ -51,10 +51,10 @@ int main(int argc, char* argv[])
#elif 1
// 3x3, 28x28
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t C = 128;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 1024;
constexpr index_t K = 128;
constexpr index_t Y = 3;
constexpr index_t X = 3;
@@ -245,7 +245,7 @@ int main(int argc, char* argv[])
device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
#elif 0
device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
#elif 0
#elif 1
device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw
#elif 1
device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk
......
@@ -201,7 +201,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 1
#elif 0
// 3x3, 35x35, stride 2
constexpr index_t N = 128;
constexpr index_t C = 288;
@@ -339,7 +339,7 @@ int main(int argc, char* argv[])
#elif 1
// 3x3, 28x28
constexpr index_t N = 128;
constexpr index_t C = 192;
constexpr index_t C = 128;
constexpr index_t HI = 28;
constexpr index_t WI = 28;
constexpr index_t K = 128;
@@ -561,7 +561,7 @@ int main(int argc, char* argv[])
LeftPads{},
RightPads{},
nrepeat);
#elif 0
#elif 1
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......