gaoqiong/composable_kernel, commit 7a3d9697
Authored Jun 28, 2020 by Chao Liu
Parent: 3b07df08

    buffer addressing use offset trick

Changes: 3
Showing 3 changed files with 734 additions and 411 deletions:

    composable_kernel/include/utility/amd_buffer_addressing.hpp       +694  -392
    composable_kernel/include/utility/config.amd.hpp.in                  +6    -5
    composable_kernel/include/utility/in_memory_operation.amd.hpp.in    +34   -14
composable_kernel/include/utility/amd_buffer_addressing.hpp
@@ -5,118 +5,118 @@
 namespace ck {

-// For 128bit SGPRs in buffer_load and buffer_store instructions
+// For 128bit SGPRs to supply resource constant in buffer instructions
 // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
 template <typename T>
-union BufferAddressConfig
+union BufferResourceConstant
 {
     int32x4_t data;
     T* address[2];
     int32_t range[4];
 };
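The four 32-bit words of this union alias the GCN buffer resource descriptor ("V#") described in the ISA link above. As a minimal sketch (own illustration, not commit code; the pointer name p_wave_base and the 16-element size are assumed), the fields are filled like this:

    // Assembling the 128-bit resource constant for a 16-element float buffer;
    // this mirrors how the specializations below fill the union.
    BufferResourceConstant<float> config;

    config.address[0] = p_wave_base;        // words 0-1: wavewise base address (64 bit)
    config.range[2]   = 16 * sizeof(float); // word 2: buffer size in bytes
    config.range[3]   = 0x00027000;         // word 3: format/setting bits used by CK

    // config.data is the int32x4_t handed to the intrinsics below as `srsrc`.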
-__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t rsrc,
+__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc,
                                                index_t vindex,
                                                index_t offset,
                                                bool glc,
                                                bool slc) __asm("llvm.amdgcn.buffer.load.f32");

-__device__ float2_t __llvm_amdgcn_buffer_load_f32x2(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");
+__device__ float2_t __llvm_amdgcn_buffer_load_f32x2(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");

-__device__ float4_t __llvm_amdgcn_buffer_load_f32x4(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");
+__device__ float4_t __llvm_amdgcn_buffer_load_f32x4(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");

-__device__ half_t __llvm_amdgcn_buffer_load_f16(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.f16");
+__device__ half_t __llvm_amdgcn_buffer_load_f16(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.f16");

-__device__ half2_t __llvm_amdgcn_buffer_load_f16x2(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v2f16");
+__device__ half2_t __llvm_amdgcn_buffer_load_f16x2(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v2f16");

-__device__ half4_t __llvm_amdgcn_buffer_load_f16x4(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v4f16");
+__device__ half4_t __llvm_amdgcn_buffer_load_f16x4(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v4f16");

-__device__ ushort __llvm_amdgcn_buffer_load_bf16(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.bf16");
+__device__ ushort __llvm_amdgcn_buffer_load_bf16(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.bf16");

-__device__ ushort2_t __llvm_amdgcn_buffer_load_bf16x2(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v2bf16");
+__device__ ushort2_t __llvm_amdgcn_buffer_load_bf16x2(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v2bf16");

-__device__ ushort4_t __llvm_amdgcn_buffer_load_bf16x4(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v4bf16");
+__device__ ushort4_t __llvm_amdgcn_buffer_load_bf16x4(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v4bf16");

-__device__ void __llvm_amdgcn_buffer_store_f32(float vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.f32");
+__device__ void __llvm_amdgcn_buffer_store_f32(float vdata, int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.f32");

-__device__ void __llvm_amdgcn_buffer_store_f32x2(float2_t vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v2f32");
+__device__ void __llvm_amdgcn_buffer_store_f32x2(float2_t vdata, int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v2f32");

-__device__ void __llvm_amdgcn_buffer_store_f32x4(float4_t vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");
+__device__ void __llvm_amdgcn_buffer_store_f32x4(float4_t vdata, int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");

-__device__ void __llvm_amdgcn_buffer_store_f16(half_t vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.f16");
+__device__ void __llvm_amdgcn_buffer_store_f16(half_t vdata, int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.f16");

-__device__ void __llvm_amdgcn_buffer_store_f16x2(half2_t vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v2f16");
+__device__ void __llvm_amdgcn_buffer_store_f16x2(half2_t vdata, int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v2f16");

-__device__ void __llvm_amdgcn_buffer_store_f16x4(half4_t vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v4f16");
+__device__ void __llvm_amdgcn_buffer_store_f16x4(half4_t vdata, int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v4f16");

-__device__ void __llvm_amdgcn_buffer_store_bf16(ushort vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc,
+__device__ void __llvm_amdgcn_buffer_store_bf16(ushort vdata, int32x4_t srsrc, index_t vindex, index_t offset, bool glc,
@@ -124,7 +124,7 @@ __device__ void __llvm_amdgcn_buffer_store_bf16(ushort vdata,
 __device__ void
-__llvm_amdgcn_buffer_store_bf16x2(ushort2_t vdata, int32x4_t rsrc,
+__llvm_amdgcn_buffer_store_bf16x2(ushort2_t vdata, int32x4_t srsrc,
                                   index_t vindex, index_t offset, bool glc,

@@ -132,7 +132,7 @@ __llvm_amdgcn_buffer_store_bf16x2(ushort2_t vdata,
 __device__ void
-__llvm_amdgcn_buffer_store_bf16x4(ushort4_t vdata, int32x4_t rsrc,
+__llvm_amdgcn_buffer_store_bf16x4(ushort4_t vdata, int32x4_t srsrc,
                                   index_t vindex, index_t offset, bool glc,
@@ -140,781 +140,1083 @@ __llvm_amdgcn_buffer_store_bf16x4(ushort4_t vdata,
 __device__ void
-__llvm_amdgcn_buffer_atomic_add_f32(float vdata, int32x4_t rsrc,
+__llvm_amdgcn_buffer_atomic_add_f32(float vdata, int32x4_t srsrc,
                                     index_t vindex, index_t offset,
                                     bool slc) __asm("llvm.amdgcn.buffer.atomic.fadd.f32");

 // buffer_load requires:
-// 1) p_src must be in global memory space, d_dst must be vgpr
-// 2) p_src to be a block-invariant pointer.
+// 1) p_src_thread must be in global memory space, p_dst_thread must be vgpr
+// 2) p_src_thread to be a wavewise pointer.
 // It is user's responsibility to make sure that is true.
 template <typename T, index_t VectorSize>
 __device__ typename vector_type<T, VectorSize>::MemoryType
-amd_buffer_load(const T* p_src_block,
+amd_buffer_load(const T* p_src_wave,
                 index_t src_thread_data_offset,
                 index_t src_const_data_offset,
-                bool src_valid);
+                bool src_data_valid,
+                index_t src_element_space);
 // buffer_store requires:
-// 1) p_src must be in vgpr space, d_dst must be global memory
-// 2) p_dst to be a block-invariant pointer.
+// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
+// 2) p_dst_thread to be a wavewise pointer.
 // It is user's responsibility to make sure that is true.
 template <typename T, index_t VectorSize>
 __device__ void
-amd_buffer_store(const T* p_src,
-                 T* p_dst_block,
+amd_buffer_store(const T* p_src_thread,
+                 T* p_dst_wave,
                  index_t dst_thread_data_offset,
                  index_t dst_const_data_offset,
-                 bool dst_valid);
+                 bool dst_data_valid,
+                 index_t dst_data_range);

+// buffer_atomic requires:
+// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
+// 2) p_dst_thread to be a wavewise pointer.
+// It is user's responsibility to make sure that is true.
 template <typename T, index_t VectorSize>
 __device__ void
-amd_buffer_atomic_add(const T* p_src,
-                      T* p_dst_block,
+amd_buffer_atomic_add(const T* p_src_thread,
+                      T* p_dst_wave,
                       index_t dst_thread_data_offset,
                       index_t dst_const_data_offset,
-                      bool dst_valid);
+                      bool dst_data_valid,
+                      index_t dst_data_range);
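For orientation, a minimal caller sketch of the widened interface (own illustration, not commit code; the function name load_one and the pointer/range names are hypothetical):

    // Each thread of a wave loads one float from a wavewise buffer of n elements.
    __device__ float load_one(const float* p_wave, index_t n, index_t tid)
    {
        bool in_bounds = tid < n;

        return amd_buffer_load<float, 1>(p_wave,
                                         tid,       // per-thread element offset (lives in a VGPR)
                                         0,         // uniform element offset
                                         in_bounds, // src_data_valid
                                         n);        // src_data_range, fills range[2] of the V#
    }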
 template <>
-__device__ float amd_buffer_load<float, 1>(const float* p_src_block,
+__device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
                                            index_t src_thread_data_offset,
                                            index_t src_const_data_offset,
-                                           bool src_valid)
+                                           bool src_data_valid,
+                                           index_t src_data_range)
 {
-    BufferAddressConfig<float> src_block_config;
+    BufferResourceConstant<float> src_wave_config;

-    // fill in byte 0 - 1
-    src_block_config.address[0] = const_cast<float*>(p_src_block);
-    // fill in byte 2
-    src_block_config.range[2] = -1;
-    // fill in byte 3
-    src_block_config.range[3] = 0x00027000;
+    // wavewise base address (64 bit)
+    src_wave_config.address[0] = const_cast<float*>(p_src_wave);
+    // wavewise range (32 bit)
+    src_wave_config.range[2] = src_data_range * sizeof(float);
+    // wavewise setting (32 bit)
+    src_wave_config.range[3] = 0x00027000;

     index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
     index_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

+#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
-    return __llvm_amdgcn_buffer_load_f32(src_block_config.data,
+    return __llvm_amdgcn_buffer_load_f32(src_wave_config.data,
                                          0,
-                                         src_valid ? (src_thread_addr_offset + src_const_addr_offset) : -1,
+                                         src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
                                          false,
                                          false);
+#else
+    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;
+
+    return __llvm_amdgcn_buffer_load_f32(src_wave_config.data,
+                                         0,
+                                         src_addr_base + src_thread_addr_offset + src_const_addr_offset,
+                                         false,
+                                         false);
+#endif
 }
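The #else branch above is the commit's offset trick. Instead of selecting per lane between a real byte offset and an out-of-range sentinel, the offset is biased by 0x7fffffff whenever the data is invalid; buffer instructions bounds-check the offset against range[2] and return 0 for out-of-range loads (and drop out-of-range stores), so the biased address behaves exactly like the old -1 sentinel. A sketch of the arithmetic (own illustration, not commit code):

    // Effective offsets seen by the buffer instruction in the two modes.
    index_t effective_offset(bool valid, index_t thread_off, index_t const_off)
    {
        // without the trick: a per-lane select on the offset operand
        //   index_t off = valid ? (thread_off + const_off) : 0xffffffff;

        // with the trick: a plain integer add; 0x7fffffff pushes the result
        // past range[2] for any buffer smaller than 2 GiB, so the load reads 0
        index_t base = valid ? 0 : 0x7fffffff;
        return base + thread_off + const_off;
    }

The remaining specializations below apply the same change; they are shown as they read after this commit.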
template <>
__device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
                                              index_t src_thread_data_offset,
                                              index_t src_const_data_offset,
                                              bool src_data_valid,
                                              index_t src_data_range)
{
    BufferResourceConstant<float> src_wave_config;
    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_f32x2(
        src_wave_config.data, 0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    return __llvm_amdgcn_buffer_load_f32x2(
        src_wave_config.data, 0,
        src_addr_base + src_thread_addr_offset + src_const_addr_offset,
        false, false);
#endif
}
template <>
__device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
                                              index_t src_thread_data_offset,
                                              index_t src_const_data_offset,
                                              bool src_data_valid,
                                              index_t src_data_range)
{
    BufferResourceConstant<float> src_wave_config;
    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<float*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(float);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_f32x4(
        src_wave_config.data, 0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    return __llvm_amdgcn_buffer_load_f32x4(
        src_wave_config.data, 0,
        src_addr_base + src_thread_addr_offset + src_const_addr_offset,
        false, false);
#endif
}
template <>
__device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
                                             index_t src_thread_data_offset,
                                             index_t src_const_data_offset,
                                             bool src_data_valid,
                                             index_t src_data_range)
{
    BufferResourceConstant<half_t> src_wave_config;
    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

#if !CK_WORKAROUND_SWDEV_231101
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(half_t);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_f16(
        src_wave_config.data, 0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    return __llvm_amdgcn_buffer_load_f16(
        src_wave_config.data, 0,
        src_addr_base + src_thread_addr_offset + src_const_addr_offset,
        false, false);
#endif
#else
    return src_data_valid ? p_src_wave[src_thread_data_offset + src_const_data_offset] : 0;
#endif
}
template <>
__device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
                                              index_t src_thread_data_offset,
                                              index_t src_const_data_offset,
                                              bool src_data_valid,
                                              index_t src_data_range)
{
    BufferResourceConstant<half_t> src_wave_config;
    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(half_t);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_config.data, 0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_config.data, 0,
        src_addr_base + src_thread_addr_offset + src_const_addr_offset,
        false, false);
#endif

    return *reinterpret_cast<half2_t*>(&dst_out_tmp);
}
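Note that this specialization no longer calls the f16x2 intrinsic at all: it fetches the same 4 bytes through a 32-bit float load and reinterprets the bit pattern (the old code only did this under CK_WORKAROUND_SWDEV_231101). A sketch of the reinterpretation (own illustration, not commit code; __builtin_memcpy is used here for clarity, the commit uses reinterpret_cast):

    // Two packed halves occupy exactly the 4 bytes of one float.
    __device__ half2_t bits_to_half2(float raw)
    {
        half2_t out;
        __builtin_memcpy(&out, &raw, sizeof(out)); // reuse the bit pattern unchanged
        return out;
    }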
template <>
__device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
                                              index_t src_thread_data_offset,
                                              index_t src_const_data_offset,
                                              bool src_data_valid,
                                              index_t src_data_range)
{
    BufferResourceConstant<half_t> src_wave_config;
    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(half_t);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
        src_wave_config.data, 0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
        src_wave_config.data, 0,
        src_addr_base + src_thread_addr_offset + src_const_addr_offset,
        false, false);
#endif

    return *reinterpret_cast<half4_t*>(&dst_out_tmp);
}
template <>
__device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
                                              index_t src_thread_data_offset,
                                              index_t src_const_data_offset,
                                              bool src_data_valid,
                                              index_t src_data_range)
{
    BufferResourceConstant<half_t> src_wave_config;
    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<half_t*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(half_t);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_config.data, 0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_config.data, 0,
        src_addr_base + src_thread_addr_offset + src_const_addr_offset,
        false, false);
#endif

    return *reinterpret_cast<half8_t*>(&dst_out_tmp);
}
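The widest specialization reuses the 128-bit f32x4 path directly, since eight halves occupy exactly one float4. A one-line check of that size assumption (own illustration, not commit code):

    static_assert(sizeof(half8_t) == sizeof(float4_t), "8 x fp16 == 4 x fp32 == 16 bytes");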
template <>
__device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
                                             index_t src_thread_data_offset,
                                             index_t src_const_data_offset,
                                             bool src_data_valid,
                                             index_t src_data_range)
{
    BufferResourceConstant<ushort> src_wave_config;
    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

#if !CK_WORKAROUND_SWDEV_231101
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(ushort);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    return __llvm_amdgcn_buffer_load_bf16(
        src_wave_config.data, 0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    return __llvm_amdgcn_buffer_load_bf16(
        src_wave_config.data, 0,
        src_addr_base + src_thread_addr_offset + src_const_addr_offset,
        false, false);
#endif
#else
    return src_data_valid ? p_src_wave[src_thread_data_offset + src_const_data_offset] : 0;
#endif
}
template <>
__device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
                                                index_t src_thread_data_offset,
                                                index_t src_const_data_offset,
                                                bool src_data_valid,
                                                index_t src_data_range)
{
    BufferResourceConstant<ushort> src_wave_config;
    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(ushort);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_config.data, 0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_config.data, 0,
        src_addr_base + src_thread_addr_offset + src_const_addr_offset,
        false, false);
#endif

    return *reinterpret_cast<ushort2_t*>(&dst_out_tmp);
}
template <>
__device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
                                                index_t src_thread_data_offset,
                                                index_t src_const_data_offset,
                                                bool src_data_valid,
                                                index_t src_data_range)
{
    BufferResourceConstant<ushort> src_wave_config;
    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(ushort);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
        src_wave_config.data, 0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(
        src_wave_config.data, 0,
        src_addr_base + src_thread_addr_offset + src_const_addr_offset,
        false, false);
#endif

    return *reinterpret_cast<ushort4_t*>(&dst_out_tmp);
}
template <>
__device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
                                                index_t src_thread_data_offset,
                                                index_t src_const_data_offset,
                                                bool src_data_valid,
                                                index_t src_data_range)
{
    BufferResourceConstant<ushort> src_wave_config;
    // wavewise base address (64 bit)
    src_wave_config.address[0] = const_cast<ushort*>(p_src_wave);
    // wavewise range (32 bit)
    src_wave_config.range[2] = src_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    src_wave_config.range[3] = 0x00027000;

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(ushort);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_config.data, 0,
        src_data_valid ? (src_thread_addr_offset + src_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t src_addr_base = src_data_valid ? 0 : 0x7fffffff;

    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(
        src_wave_config.data, 0,
        src_addr_base + src_thread_addr_offset + src_const_addr_offset,
        false, false);
#endif

    return *reinterpret_cast<ushort8_t*>(&dst_out_tmp);
}
template <>
__device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
                                           float* p_dst_wave,
                                           index_t dst_thread_data_offset,
                                           index_t dst_const_data_offset,
                                           bool dst_data_valid,
                                           index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_config;
    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32(
        *p_src_thread, dst_wave_config.data, 0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32(
        *p_src_thread, dst_wave_config.data, 0,
        dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
        false, false);
#endif
}
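Semantically the store side relies on the same hardware bounds check: an out-of-range buffer_store is silently dropped, so the call above behaves like this guarded plain-memory sketch (own illustration, not commit code; names hypothetical):

    __device__ void guarded_store(float v, float* p_dst_wave, index_t elem, bool ok, index_t n)
    {
        if(ok && elem < n) // the V# range check plays the role of `elem < n`
        {
            p_dst_wave[elem] = v;
        }
    }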
template <>
__device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
                                           float* p_dst_wave,
                                           index_t dst_thread_data_offset,
                                           index_t dst_const_data_offset,
                                           bool dst_data_valid,
                                           index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_config;
    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x2(
        *reinterpret_cast<const float2_t*>(p_src_thread), dst_wave_config.data, 0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x2(
        *reinterpret_cast<const float2_t*>(p_src_thread), dst_wave_config.data, 0,
        dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
        false, false);
#endif
}
template <>
__device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
                                           float* p_dst_wave,
                                           index_t dst_thread_data_offset,
                                           index_t dst_const_data_offset,
                                           bool dst_data_valid,
                                           index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_config;
    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x4(
        *reinterpret_cast<const float4_t*>(p_src_thread), dst_wave_config.data, 0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(
        *reinterpret_cast<const float4_t*>(p_src_thread), dst_wave_config.data, 0,
        dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
        false, false);
#endif
}
template <>
__device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<half_t> dst_wave_config;
    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

#if !CK_WORKAROUND_SWDEV_231101
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(half_t);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f16(
        *p_src_thread, dst_wave_config.data, 0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f16(
        *p_src_thread, dst_wave_config.data, 0,
        dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
        false, false);
#endif
#else
    if(dst_data_valid)
    {
        p_dst_wave[dst_thread_data_offset + dst_const_data_offset] = *p_src_thread;
    }
#endif
}
template <>
__device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<half_t> dst_wave_config;
    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(half_t);

    const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32(
        *p_src_tmp, dst_wave_config.data, 0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32(
        *p_src_tmp, dst_wave_config.data, 0,
        dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
        false, false);
#endif
}
template <>
__device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<half_t> dst_wave_config;
    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(half_t);

    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x2(
        *p_src_tmp, dst_wave_config.data, 0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x2(
        *p_src_tmp, dst_wave_config.data, 0,
        dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
        false, false);
#endif
}
template <>
__device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
                                            half_t* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(half_t);

    BufferResourceConstant<half_t> dst_wave_config;
    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(half_t);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x4(
        *p_src_tmp, dst_wave_config.data, 0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(
        *p_src_tmp, dst_wave_config.data, 0,
        dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
        false, false);
#endif
}
template <>
__device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<ushort> dst_wave_config;
    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

#if !CK_WORKAROUND_SWDEV_231101
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(ushort);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_bf16(
        *p_src_thread, dst_wave_config.data, 0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_bf16(
        *p_src_thread, dst_wave_config.data, 0,
        dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
        false, false);
#endif
#else
    if(dst_data_valid)
    {
        p_dst_wave[dst_thread_data_offset + dst_const_data_offset] = *p_src_thread;
    }
#endif
}
template <>
__device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<ushort> dst_wave_config;
    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(ushort);

    const float* p_src_tmp = reinterpret_cast<const float*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32(
        *p_src_tmp, dst_wave_config.data, 0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32(
        *p_src_tmp, dst_wave_config.data, 0,
        dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
        false, false);
#endif
}
template <>
__device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<ushort> dst_wave_config;
    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(ushort);

    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x2(
        *p_src_tmp, dst_wave_config.data, 0,
        dst_data_valid ? (dst_thread_addr_offset + dst_const_addr_offset) : 0xffffffff,
        false, false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x2(
        *p_src_tmp, dst_wave_config.data, 0,
        dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
        false, false);
#endif
}
template <>
__device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
                                            ushort* p_dst_wave,
                                            index_t dst_thread_data_offset,
                                            index_t dst_const_data_offset,
                                            bool dst_data_valid,
                                            index_t dst_data_range)
{
    BufferResourceConstant<ushort> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(ushort);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(ushort);

    const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
                                     dst_wave_config.data,
                                     0,
                                     dst_data_valid
                                         ? (dst_thread_addr_offset + dst_const_addr_offset)
                                         : 0xffffffff,
                                     false,
                                     false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
                                     dst_wave_config.data,
                                     0,
                                     dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
                                     false,
                                     false);
#endif
}
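The eight-wide ushort store can reuse the f32x4 intrinsic because both views cover the same 16-byte payload, so a single dwordx4 store moves all eight 16-bit values; a quick compile-time sanity check (a sketch, not part of the commit):

typedef unsigned short ushort_check; // stand-in for the kernel's ushort

static_assert(8 * sizeof(ushort_check) == 4 * sizeof(float),
              "ushort x 8 and float x 4 must be the same 16 bytes");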
template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
                                                float* p_dst_wave,
                                                index_t dst_thread_data_offset,
                                                index_t dst_const_data_offset,
                                                bool dst_data_valid,
                                                index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    __llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
                                        dst_wave_config.data,
                                        0,
                                        dst_data_valid
                                            ? (dst_thread_addr_offset + dst_const_addr_offset)
                                            : 0xffffffff,
                                        false);
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    __llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
                                        dst_wave_config.data,
                                        0,
                                        dst_addr_base + dst_thread_addr_offset + dst_const_addr_offset,
                                        false);
#endif
}
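For reference, the union fields used above alias the four dwords of the buffer resource constant; under the 64-bit little-endian layout of amdgcn, the writes land as follows (an annotated sketch of the same assignments):

BufferResourceConstant<float> cfg;
cfg.address[0] = p_dst_wave;                     // dwords 0-1: wavewise base address
cfg.range[2]   = dst_data_range * sizeof(float); // dword 2: num_records, in bytes here
cfg.range[3]   = 0x00027000;                     // dword 3: format/config setting
// cfg.data is the int32x4_t handed to the buffer intrinsics as the SGPR resource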
template <>
__device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
                                                float* p_dst_wave,
                                                index_t dst_thread_data_offset,
                                                index_t dst_const_data_offset,
                                                bool dst_data_valid,
                                                index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    for(index_t i = 0; i < 2; ++i)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                            dst_wave_config.data,
                                            0,
                                            dst_data_valid
                                                ? (dst_thread_addr_offset + dst_const_addr_offset +
                                                   i * sizeof(float))
                                                : 0xffffffff,
                                            false);
    }
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    for(index_t i = 0; i < 2; ++i)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                            dst_wave_config.data,
                                            0,
                                            dst_addr_base + dst_thread_addr_offset +
                                                dst_const_addr_offset + i * sizeof(float),
                                            false);
    }
#endif
}
template <>
__device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
                                                float* p_dst_wave,
                                                index_t dst_thread_data_offset,
                                                index_t dst_const_data_offset,
                                                bool dst_data_valid,
                                                index_t dst_data_range)
{
    BufferResourceConstant<float> dst_wave_config;

    // wavewise base address (64 bit)
    dst_wave_config.address[0] = p_dst_wave;
    // wavewise range (32 bit)
    dst_wave_config.range[2] = dst_data_range * sizeof(float);
    // wavewise setting (32 bit)
    dst_wave_config.range[3] = 0x00027000;

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
    for(index_t i = 0; i < 4; ++i)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                            dst_wave_config.data,
                                            0,
                                            dst_data_valid
                                                ? (dst_thread_addr_offset + dst_const_addr_offset +
                                                   i * sizeof(float))
                                                : 0xffffffff,
                                            false);
    }
#else
    index_t dst_addr_base = dst_data_valid ? 0 : 0x7fffffff;

    for(index_t i = 0; i < 4; ++i)
    {
        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
                                            dst_wave_config.data,
                                            0,
                                            dst_addr_base + dst_thread_addr_offset +
                                                dst_const_addr_offset + i * sizeof(float),
                                            false);
    }
#endif
}
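A hypothetical call site for the widest specialization (every name outside ck:: is invented for illustration): each thread contributes four floats to a wavewise output buffer, and threads flagged out of bound have their adds dropped by the addressing scheme above.

__device__ void accumulate_partial(const float* acc, // 4 values held per thread
                                   float* p_out_wave,
                                   ck::index_t thread_elem_offset,
                                   bool in_bound,
                                   ck::index_t out_elem_count)
{
    ck::amd_buffer_atomic_add<float, 4>(
        acc, p_out_wave, thread_elem_offset, 0 /* const offset */, in_bound, out_elem_count);
}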
} // namespace ck
...
composable_kernel/include/utility/config.amd.hpp.in
View file @
7a3d9697
...
@@ -49,12 +49,13 @@
#endif

// experimental implementation
#ifndef CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
#define CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK 1
#endif

#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#endif

#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0

#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
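Because the new switch is wrapped in an #ifndef guard, it can be flipped per build without touching the header; an illustrative compiler invocation (not from this commit):

    hipcc -DCK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK=0 ...

which falls back to the per-access ternary-select addressing in the #if branches above.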
...
composable_kernel/include/utility/in_memory_operation.amd.hpp.in
View file @
7a3d9697
...
@@ -50,9 +50,11 @@ struct SetData
    __device__ void Run(const T* p_src,
                        index_t src_offset,
                        bool src_valid,
                        index_t /* src_range */,
                        T* p_dst,
                        index_t dst_offset,
                        bool dst_valid,
                        index_t /* dst_range */) const
    {
        if(dst_valid)
        {
...
@@ -77,14 +79,16 @@ struct SetData
    __device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
                                                                  index_t src_offset,
                                                                  bool src_valid,
                                                                  index_t src_range,
                                                                  T* p_dst,
                                                                  index_t dst_offset,
                                                                  bool dst_valid,
                                                                  index_t /* dst_range */) const
    {
        if(dst_valid)
        {
            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
                amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0, src_valid, src_range);
        }
    }
...
@@ -96,14 +100,16 @@ struct SetData
    __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
                                                                  index_t src_offset,
                                                                  bool src_valid,
                                                                  index_t /* src_range */,
                                                                  T* p_dst,
                                                                  index_t dst_offset,
                                                                  bool dst_valid,
                                                                  index_t dst_range) const
    {
        const auto zeros = vector_t(0);

        amd_buffer_store<T, DataPerAccess>(
            src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid, dst_range);
    }
#endif
};
...
@@ -118,9 +124,11 @@ struct AtomicAddData
    __device__ void Run(const T* p_src,
                        index_t src_offset,
                        bool src_valid,
                        index_t /* src_range */,
                        T* p_dst,
                        index_t dst_offset,
                        bool dst_valid,
                        index_t /* dst_range */) const
    {
        if(src_valid && dst_valid)
        {
...
@@ -137,14 +145,20 @@ struct AtomicAddData
    template <>
    __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
                                                                  index_t src_offset,
                                                                  bool src_valid,
                                                                  index_t /* src_range */,
                                                                  T* p_dst,
                                                                  index_t dst_offset,
                                                                  bool dst_valid,
                                                                  index_t dst_range) const
    {
        const auto zeros = vector_t(0);

        amd_buffer_atomic_add<T, DataPerAccess>(
            src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, 0, dst_valid, dst_range);
    }
#endif
};
...
@@ -159,9 +173,11 @@ template <typename T,
__device__ void transfer_data(const T* p_src,
                              index_t src_offset,
                              bool src_valid,
                              index_t src_range,
                              T* p_dst,
                              index_t dst_offset,
                              bool dst_valid,
                              index_t dst_range)
{
    static_assert(DstInMemOp == InMemoryDataOperation::Set ||
                      DstInMemOp == InMemoryDataOperation::AtomicAdd,
...
@@ -173,12 +189,12 @@ __device__ void transfer_data(const T* p_src,
        // TODO: use static_if::ElseIf
        static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
            SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
                p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
        });

        static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
            AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
                p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
        });
    }
    else
...
@@ -191,9 +207,11 @@ __device__ void transfer_data(const T* p_src,
                    p_src,
                    src_offset + i * SrcDataStride,
                    src_valid,
                    src_range,
                    p_dst,
                    dst_offset + i * DstDataStride,
                    dst_valid,
                    dst_range);
            });

        static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
...
@@ -201,9 +219,11 @@ __device__ void transfer_data(const T* p_src,
                    p_src,
                    src_offset + i * SrcDataStride,
                    src_valid,
                    src_range,
                    p_dst,
                    dst_offset + i * DstDataStride,
                    dst_valid,
                    dst_range);
            });
    }
}
...
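For completeness, a hypothetical direct use of the extended SetData interface (names other than the ck:: identifiers are invented; transfer_data normally reaches this through the static_if dispatch above): copy one float per thread from a global buffer into a register, forwarding the source buffer's element count as src_range so the buffer load can bounds-check.

ck::SetData<float, 1>{}.template Run<ck::AddressSpace::Global, ck::AddressSpace::Vgpr>(
    p_in, src_elem_offset, src_elem_offset < src_elem_count, src_elem_count,
    p_reg, 0, true, 0 /* dst_range unused for a VGPR destination */);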