Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
435f5f91
Commit
435f5f91
authored
Jun 28, 2020
by
Chao Liu
Browse files
buffer APIs use combined wave and thread offset
parent
7a3d9697
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
351 additions
and
461 deletions
+351
-461
composable_kernel/include/utility/amd_buffer_addressing.hpp
composable_kernel/include/utility/amd_buffer_addressing.hpp
+342
-451
composable_kernel/include/utility/in_memory_operation.amd.hpp.in
...ble_kernel/include/utility/in_memory_operation.amd.hpp.in
+9
-10
No files found.
composable_kernel/include/utility/amd_buffer_addressing.hpp
View file @
435f5f91
...
...
@@ -13,6 +13,7 @@ union BufferResourceConstant
int32x4_t
data
;
T
*
address
[
2
];
int32_t
range
[
4
];
int32_t
config
[
4
];
};
__device__
float
__llvm_amdgcn_buffer_load_f32
(
int32x4_t
srsrc
,
...
...
@@ -153,8 +154,7 @@ template <typename T, index_t VectorSize>
__device__
typename
vector_type
<
T
,
VectorSize
>::
MemoryType
amd_buffer_load
(
const
T
*
p_src_wave
,
index_t
src_thread_data_offset
,
index_t
src_const_data_offset
,
bool
src_data_valid
,
bool
src_thread_data_valid
,
index_t
src_elemenst_space
);
// buffer_store requires:
...
...
@@ -165,8 +165,7 @@ template <typename T, index_t VectorSize>
__device__
void
amd_buffer_store
(
const
T
*
p_src_thread
,
T
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
);
// buffer_atomic requires:
...
...
@@ -177,201 +176,170 @@ template <typename T, index_t VectorSize>
__device__
void
amd_buffer_atomic_add
(
const
T
*
p_src_thread
,
T
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
);
template
<
>
__device__
float
amd_buffer_load
<
float
,
1
>
(
const
float
*
p_src_wave
,
index_t
src_thread_data_offset
,
index_t
src_const_data_offset
,
bool
src_data_valid
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
float
>
src_wave_
config
;
BufferResourceConstant
<
float
>
src_wave_
buffer_resource
;
// wavewise base address (64 bit)
src_wave_
config
.
address
[
0
]
=
const_cast
<
float
*>
(
p_src_wave
);
src_wave_
buffer_resource
.
address
[
0
]
=
const_cast
<
float
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_
config
.
range
[
2
]
=
src_data_range
*
sizeof
(
float
);
src_wave_
buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
src_wave_config
.
range
[
3
]
=
0x00027000
;
src_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
index_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
float
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return
__llvm_amdgcn_buffer_load_f32
(
src_wave_config
.
data
,
return
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_data_valid
?
(
src_thread_addr_offset
+
src_const_addr_offset
)
:
0xffffffff
,
src_thread_data_valid
?
src_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
src_addr_base
=
src_data_valid
?
0
:
0x7fffffff
;
index_t
src_addr_base
=
src_
thread_
data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32
(
src_wave_config
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
return
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
,
false
,
false
);
#endif
}
template
<
>
__device__
float2_t
amd_buffer_load
<
float
,
2
>
(
const
float
*
p_src_wave
,
index_t
src_thread_data_offset
,
index_t
src_const_data_offset
,
bool
src_data_valid
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
float
>
src_wave_
config
;
BufferResourceConstant
<
float
>
src_wave_
buffer_resource
;
// wavewise base address (64 bit)
src_wave_
config
.
address
[
0
]
=
const_cast
<
float
*>
(
p_src_wave
);
src_wave_
buffer_resource
.
address
[
0
]
=
const_cast
<
float
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_
config
.
range
[
2
]
=
src_data_range
*
sizeof
(
float
);
src_wave_
buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
src_wave_config
.
range
[
3
]
=
0x00027000
;
src_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
index_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
float
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_config
.
data
,
return
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_data_valid
?
(
src_thread_addr_offset
+
src_const_addr_offset
)
:
0xffffffff
,
src_thread_data_valid
?
src_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
src_addr_base
=
src_data_valid
?
0
:
0x7fffffff
;
index_t
src_addr_base
=
src_
thread_
data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_config
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
return
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
,
false
,
false
);
#endif
}
template
<
>
__device__
float4_t
amd_buffer_load
<
float
,
4
>
(
const
float
*
p_src_wave
,
index_t
src_thread_data_offset
,
index_t
src_const_data_offset
,
bool
src_data_valid
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
float
>
src_wave_
config
;
BufferResourceConstant
<
float
>
src_wave_
buffer_resource
;
// wavewise base address (64 bit)
src_wave_
config
.
address
[
0
]
=
const_cast
<
float
*>
(
p_src_wave
);
src_wave_
buffer_resource
.
address
[
0
]
=
const_cast
<
float
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_
config
.
range
[
2
]
=
src_data_range
*
sizeof
(
float
);
src_wave_
buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
src_wave_config
.
range
[
3
]
=
0x00027000
;
src_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
index_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
float
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_config
.
data
,
return
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_data_valid
?
(
src_thread_addr_offset
+
src_const_addr_offset
)
:
0xffffffff
,
src_thread_data_valid
?
src_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
src_addr_base
=
src_data_valid
?
0
:
0x7fffffff
;
index_t
src_addr_base
=
src_
thread_
data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_config
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
return
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
,
false
,
false
);
#endif
}
template
<
>
__device__
half_t
amd_buffer_load
<
half_t
,
1
>
(
const
half_t
*
p_src_wave
,
index_t
src_thread_data_offset
,
index_t
src_const_data_offset
,
bool
src_data_valid
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
half_t
>
src_wave_
config
;
BufferResourceConstant
<
half_t
>
src_wave_
buffer_resource
;
// wavewise base address (64 bit)
src_wave_
config
.
address
[
0
]
=
const_cast
<
half_t
*>
(
p_src_wave
);
src_wave_
buffer_resource
.
address
[
0
]
=
const_cast
<
half_t
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_
config
.
range
[
2
]
=
src_data_range
*
sizeof
(
half_t
);
src_wave_
buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
half_t
);
// wavewise setting (32 bit)
src_wave_config
.
range
[
3
]
=
0x00027000
;
src_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
#if !CK_WORKAROUND_SWDEV_231101
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
half_t
);
index_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
half_t
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return
__llvm_amdgcn_buffer_load_f16
(
src_wave_config
.
data
,
return
__llvm_amdgcn_buffer_load_f16
(
src_wave_buffer_resource
.
data
,
0
,
src_data_valid
?
(
src_thread_addr_offset
+
src_const_addr_offset
)
:
0xffffffff
,
src_thread_data_valid
?
src_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
src_addr_base
=
src_data_valid
?
0
:
0x7fffffff
;
index_t
src_addr_base
=
src_
thread_
data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f16
(
src_wave_config
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
return
__llvm_amdgcn_buffer_load_f16
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
,
false
,
false
);
#endif
#else
return
src_data_valid
?
p_src_wave
[
src_thread_data_offset
+
src_const_data_offset
]
:
0
;
return
src_
thread_
data_valid
?
p_src_wave
[
src_thread_data_offset
]
:
0
;
#endif
}
template
<
>
__device__
half2_t
amd_buffer_load
<
half_t
,
2
>
(
const
half_t
*
p_src_wave
,
index_t
src_thread_data_offset
,
index_t
src_const_data_offset
,
bool
src_data_valid
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
half_t
>
src_wave_
config
;
BufferResourceConstant
<
half_t
>
src_wave_
buffer_resource
;
// wavewise base address (64 bit)
src_wave_
config
.
address
[
0
]
=
const_cast
<
half_t
*>
(
p_src_wave
);
src_wave_
buffer_resource
.
address
[
0
]
=
const_cast
<
half_t
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_
config
.
range
[
2
]
=
src_data_range
*
sizeof
(
half_t
);
src_wave_
buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
half_t
);
// wavewise setting (32 bit)
src_wave_config
.
range
[
3
]
=
0x00027000
;
src_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
half_t
);
index_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
half_t
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
src_wave_config
.
data
,
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_data_valid
?
(
src_thread_addr_offset
+
src_const_addr_offset
)
:
0xffffffff
,
src_
thread_
data_valid
?
src_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
src_addr_base
=
src_data_valid
?
0
:
0x7fffffff
;
index_t
src_addr_base
=
src_
thread_
data_valid
?
0
:
0x7fffffff
;
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
src_wave_config
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
,
false
,
false
);
#endif
return
*
reinterpret_cast
<
half2_t
*>
(
&
dst_out_tmp
);
...
...
@@ -380,38 +348,32 @@ __device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
template
<
>
__device__
half4_t
amd_buffer_load
<
half_t
,
4
>
(
const
half_t
*
p_src_wave
,
index_t
src_thread_data_offset
,
index_t
src_const_data_offset
,
bool
src_data_valid
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
half_t
>
src_wave_
config
;
BufferResourceConstant
<
half_t
>
src_wave_
buffer_resource
;
// wavewise base address (64 bit)
src_wave_
config
.
address
[
0
]
=
const_cast
<
half_t
*>
(
p_src_wave
);
src_wave_
buffer_resource
.
address
[
0
]
=
const_cast
<
half_t
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_
config
.
range
[
2
]
=
src_data_range
*
sizeof
(
half_t
);
src_wave_
buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
half_t
);
// wavewise setting (32 bit)
src_wave_config
.
range
[
3
]
=
0x00027000
;
src_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
half_t
);
index_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
half_t
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_config
.
data
,
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_data_valid
?
(
src_thread_addr_offset
+
src_const_addr_offset
)
:
0xffffffff
,
src_
thread_
data_valid
?
src_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
src_addr_base
=
src_data_valid
?
0
:
0x7fffffff
;
index_t
src_addr_base
=
src_
thread_
data_valid
?
0
:
0x7fffffff
;
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_config
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
,
false
,
false
);
#endif
return
*
reinterpret_cast
<
half4_t
*>
(
&
dst_out_tmp
);
...
...
@@ -420,38 +382,32 @@ __device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
template
<
>
__device__
half8_t
amd_buffer_load
<
half_t
,
8
>
(
const
half_t
*
p_src_wave
,
index_t
src_thread_data_offset
,
index_t
src_const_data_offset
,
bool
src_data_valid
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
half_t
>
src_wave_
config
;
BufferResourceConstant
<
half_t
>
src_wave_
buffer_resource
;
// wavewise base address (64 bit)
src_wave_
config
.
address
[
0
]
=
const_cast
<
half_t
*>
(
p_src_wave
);
src_wave_
buffer_resource
.
address
[
0
]
=
const_cast
<
half_t
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_
config
.
range
[
2
]
=
src_data_range
*
sizeof
(
half_t
);
src_wave_
buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
half_t
);
// wavewise setting (32 bit)
src_wave_config
.
range
[
3
]
=
0x00027000
;
src_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
half_t
);
index_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
half_t
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_config
.
data
,
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_data_valid
?
(
src_thread_addr_offset
+
src_const_addr_offset
)
:
0xffffffff
,
src_
thread_
data_valid
?
src_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
src_addr_base
=
src_data_valid
?
0
:
0x7fffffff
;
index_t
src_addr_base
=
src_
thread_
data_valid
?
0
:
0x7fffffff
;
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_config
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
,
false
,
false
);
#endif
return
*
reinterpret_cast
<
half8_t
*>
(
&
dst_out_tmp
);
...
...
@@ -460,81 +416,69 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
template
<
>
__device__
ushort
amd_buffer_load
<
ushort
,
1
>
(
const
ushort
*
p_src_wave
,
index_t
src_thread_data_offset
,
index_t
src_const_data_offset
,
bool
src_data_valid
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
ushort
>
src_wave_
config
;
BufferResourceConstant
<
ushort
>
src_wave_
buffer_resource
;
// wavewise base address (64 bit)
src_wave_
config
.
address
[
0
]
=
const_cast
<
ushort
*>
(
p_src_wave
);
src_wave_
buffer_resource
.
address
[
0
]
=
const_cast
<
ushort
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_
config
.
range
[
2
]
=
src_data_range
*
sizeof
(
ushort
);
src_wave_
buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
ushort
);
// wavewise setting (32 bit)
src_wave_config
.
range
[
3
]
=
0x00027000
;
src_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
#if !CK_WORKAROUND_SWDEV_231101
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
ushort
);
index_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
ushort
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
return
__llvm_amdgcn_buffer_load_bf16
(
src_wave_config
.
data
,
return
__llvm_amdgcn_buffer_load_bf16
(
src_wave_buffer_resource
.
data
,
0
,
src_data_valid
?
(
src_thread_addr_offset
+
src_const_addr_offset
)
:
0xffffffff
,
src_thread_data_valid
?
src_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
src_addr_base
=
src_data_valid
?
0
:
0x7fffffff
;
index_t
src_addr_base
=
src_
thread_
data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_bf16
(
src_wave_config
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
return
__llvm_amdgcn_buffer_load_bf16
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
,
false
,
false
);
#endif
#else
return
src_data_valid
?
p_src_wave
[
src_thread_data_offset
+
src_const_data_offset
]
:
0
;
return
src_
thread_
data_valid
?
p_src_wave
[
src_thread_data_offset
]
:
0
;
#endif
}
template
<
>
__device__
ushort2_t
amd_buffer_load
<
ushort
,
2
>
(
const
ushort
*
p_src_wave
,
index_t
src_thread_data_offset
,
index_t
src_const_data_offset
,
bool
src_data_valid
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
ushort
>
src_wave_
config
;
BufferResourceConstant
<
ushort
>
src_wave_
buffer_resource
;
// wavewise base address (64 bit)
src_wave_
config
.
address
[
0
]
=
const_cast
<
ushort
*>
(
p_src_wave
);
src_wave_
buffer_resource
.
address
[
0
]
=
const_cast
<
ushort
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_
config
.
range
[
2
]
=
src_data_range
*
sizeof
(
ushort
);
src_wave_
buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
ushort
);
// wavewise setting (32 bit)
src_wave_config
.
range
[
3
]
=
0x00027000
;
src_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
ushort
);
index_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
ushort
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
src_wave_config
.
data
,
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_data_valid
?
(
src_thread_addr_offset
+
src_const_addr_offset
)
:
0xffffffff
,
src_
thread_
data_valid
?
src_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
src_addr_base
=
src_data_valid
?
0
:
0x7fffffff
;
index_t
src_addr_base
=
src_
thread_
data_valid
?
0
:
0x7fffffff
;
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
src_wave_config
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
,
false
,
false
);
#endif
return
*
reinterpret_cast
<
ushort2_t
*>
(
&
dst_out_tmp
);
...
...
@@ -543,38 +487,32 @@ __device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
template
<
>
__device__
ushort4_t
amd_buffer_load
<
ushort
,
4
>
(
const
ushort
*
p_src_wave
,
index_t
src_thread_data_offset
,
index_t
src_const_data_offset
,
bool
src_data_valid
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
ushort
>
src_wave_
config
;
BufferResourceConstant
<
ushort
>
src_wave_
buffer_resource
;
// wavewise base address (64 bit)
src_wave_
config
.
address
[
0
]
=
const_cast
<
ushort
*>
(
p_src_wave
);
src_wave_
buffer_resource
.
address
[
0
]
=
const_cast
<
ushort
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_
config
.
range
[
2
]
=
src_data_range
*
sizeof
(
ushort
);
src_wave_
buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
ushort
);
// wavewise setting (32 bit)
src_wave_config
.
range
[
3
]
=
0x00027000
;
src_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
ushort
);
index_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
ushort
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_config
.
data
,
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_data_valid
?
(
src_thread_addr_offset
+
src_const_addr_offset
)
:
0xffffffff
,
src_
thread_
data_valid
?
src_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
src_addr_base
=
src_data_valid
?
0
:
0x7fffffff
;
index_t
src_addr_base
=
src_
thread_
data_valid
?
0
:
0x7fffffff
;
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_config
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
,
false
,
false
);
#endif
return
*
reinterpret_cast
<
ushort4_t
*>
(
&
dst_out_tmp
);
...
...
@@ -583,38 +521,32 @@ __device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
template
<
>
__device__
ushort8_t
amd_buffer_load
<
ushort
,
8
>
(
const
ushort
*
p_src_wave
,
index_t
src_thread_data_offset
,
index_t
src_const_data_offset
,
bool
src_data_valid
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
ushort
>
src_wave_
config
;
BufferResourceConstant
<
ushort
>
src_wave_
buffer_resource
;
// wavewise base address (64 bit)
src_wave_
config
.
address
[
0
]
=
const_cast
<
ushort
*>
(
p_src_wave
);
src_wave_
buffer_resource
.
address
[
0
]
=
const_cast
<
ushort
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_
config
.
range
[
2
]
=
src_data_range
*
sizeof
(
ushort
);
src_wave_
buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
ushort
);
// wavewise setting (32 bit)
src_wave_config
.
range
[
3
]
=
0x00027000
;
src_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
ushort
);
index_t
src_const_addr_offset
=
src_const_data_offset
*
sizeof
(
ushort
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_config
.
data
,
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_data_valid
?
(
src_thread_addr_offset
+
src_const_addr_offset
)
:
0xffffffff
,
src_
thread_
data_valid
?
src_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
src_addr_base
=
src_data_valid
?
0
:
0x7fffffff
;
index_t
src_addr_base
=
src_
thread_
data_valid
?
0
:
0x7fffffff
;
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_config
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
+
src_const_addr_offset
,
false
,
false
);
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_base
+
src_thread_addr_offset
,
false
,
false
);
#endif
return
*
reinterpret_cast
<
ushort8_t
*>
(
&
dst_out_tmp
);
...
...
@@ -624,37 +556,34 @@ template <>
__device__
void
amd_buffer_store
<
float
,
1
>
(
const
float
*
p_src_thread
,
float
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
BufferResourceConstant
<
float
>
dst_wave_
config
;
BufferResourceConstant
<
float
>
dst_wave_
buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
float
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32
(
*
p_src_thread
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
)
:
0xffffffff
,
dst_thread_data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32
(
*
p_src_thread
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
,
dst_addr_base
+
dst_thread_addr_offset
,
false
,
false
);
#endif
...
...
@@ -664,37 +593,34 @@ template <>
__device__
void
amd_buffer_store
<
float
,
2
>
(
const
float
*
p_src_thread
,
float
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
BufferResourceConstant
<
float
>
dst_wave_
config
;
BufferResourceConstant
<
float
>
dst_wave_
buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
float
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x2
(
*
reinterpret_cast
<
const
float2_t
*>
(
p_src_thread
),
dst_wave_config
.
data
,
__llvm_amdgcn_buffer_store_f32x2
(
*
reinterpret_cast
<
const
float2_t
*>
(
p_src_thread
),
dst_wave_buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
)
:
0xffffffff
,
dst_
thread_
data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x2
(
*
reinterpret_cast
<
const
float2_t
*>
(
p_src_thread
),
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
,
dst_addr_base
+
dst_thread_addr_offset
,
false
,
false
);
#endif
...
...
@@ -704,37 +630,34 @@ template <>
__device__
void
amd_buffer_store
<
float
,
4
>
(
const
float
*
p_src_thread
,
float
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
BufferResourceConstant
<
float
>
dst_wave_
config
;
BufferResourceConstant
<
float
>
dst_wave_
buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
float
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x4
(
*
reinterpret_cast
<
const
float4_t
*>
(
p_src_thread
),
dst_wave_config
.
data
,
__llvm_amdgcn_buffer_store_f32x4
(
*
reinterpret_cast
<
const
float4_t
*>
(
p_src_thread
),
dst_wave_buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
)
:
0xffffffff
,
dst_
thread_
data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x4
(
*
reinterpret_cast
<
const
float4_t
*>
(
p_src_thread
),
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
,
dst_addr_base
+
dst_thread_addr_offset
,
false
,
false
);
#endif
...
...
@@ -744,46 +667,43 @@ template <>
__device__
void
amd_buffer_store
<
half_t
,
1
>
(
const
half_t
*
p_src_thread
,
half_t
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
BufferResourceConstant
<
half_t
>
dst_wave_
config
;
BufferResourceConstant
<
half_t
>
dst_wave_
buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
*
sizeof
(
half_t
);
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
half_t
);
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
#if !CK_WORKAROUND_SWDEV_231101
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
half_t
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
half_t
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f16
(
*
p_src_thread
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
)
:
0xffffffff
,
dst_thread_data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f16
(
*
p_src_thread
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
,
dst_addr_base
+
dst_thread_addr_offset
,
false
,
false
);
#endif
#else
if
(
dst_data_valid
)
if
(
dst_
thread_
data_valid
)
{
p_dst_wave
[
dst_thread_data_offset
+
dst_const_data_offset
]
=
*
p_src_thread
;
p_dst_wave
[
dst_thread_data_offset
]
=
*
p_src_thread
;
}
#endif
}
...
...
@@ -792,39 +712,36 @@ template <>
__device__
void
amd_buffer_store
<
half_t
,
2
>
(
const
half_t
*
p_src_thread
,
half_t
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
BufferResourceConstant
<
half_t
>
dst_wave_
config
;
BufferResourceConstant
<
half_t
>
dst_wave_
buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
*
sizeof
(
half_t
);
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
half_t
);
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
half_t
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
half_t
);
const
float
*
p_src_tmp
=
reinterpret_cast
<
const
float
*>
(
p_src_thread
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32
(
*
p_src_tmp
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
)
:
0xffffffff
,
dst_thread_data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32
(
*
p_src_tmp
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
,
dst_addr_base
+
dst_thread_addr_offset
,
false
,
false
);
#endif
...
...
@@ -834,39 +751,38 @@ template <>
__device__
void
amd_buffer_store
<
half_t
,
4
>
(
const
half_t
*
p_src_thread
,
half_t
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
half_t
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
half_t
);
BufferResourceConstant
<
half_t
>
dst_wave_config
;
BufferResourceConstant
<
half_t
>
dst_wave_buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
*
sizeof
(
half_t
);
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
half_t
);
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
half_t
);
const
float2_t
*
p_src_tmp
=
reinterpret_cast
<
const
float2_t
*>
(
p_src_thread
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
dst_wave_config
.
data
,
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
dst_wave_buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
)
:
0xffffffff
,
dst_thread_data_valid
?
dst_thread_addr_offset
,
:
0xffffffff
,
false
,
false
);
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
,
dst_addr_base
+
dst_thread_addr_offset
,
false
,
false
);
#endif
...
...
@@ -876,39 +792,36 @@ template <>
__device__
void
amd_buffer_store
<
half_t
,
8
>
(
const
half_t
*
p_src_thread
,
half_t
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
half_t
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
half_t
);
BufferResourceConstant
<
half_t
>
dst_wave_config
;
BufferResourceConstant
<
half_t
>
dst_wave_buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
*
sizeof
(
half_t
);
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
half_t
);
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
half_t
);
const
float4_t
*
p_src_tmp
=
reinterpret_cast
<
const
float4_t
*>
(
p_src_thread
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x4
(
*
p_src_tmp
,
dst_wave_config
.
data
,
__llvm_amdgcn_buffer_store_f32x4
(
*
p_src_tmp
,
dst_wave_buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
)
:
0xffffffff
,
dst_
thread_
data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x4
(
*
p_src_tmp
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
,
dst_addr_base
+
dst_thread_addr_offset
,
false
,
false
);
#endif
...
...
@@ -918,46 +831,43 @@ template <>
__device__
void
amd_buffer_store
<
ushort
,
1
>
(
const
ushort
*
p_src_thread
,
ushort
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
BufferResourceConstant
<
ushort
>
dst_wave_
config
;
BufferResourceConstant
<
ushort
>
dst_wave_
buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
*
sizeof
(
ushort
);
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
ushort
);
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
#if !CK_WORKAROUND_SWDEV_231101
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
ushort
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
ushort
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_bf16
(
*
p_src_thread
,
dst_wave_config
.
data
,
__llvm_amdgcn_buffer_store_bf16
(
*
p_src_thread
,
dst_wave_buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
)
:
0xffffffff
,
dst_
thread_
data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_bf16
(
*
p_src_thread
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
,
dst_addr_base
+
dst_thread_addr_offset
,
false
,
false
);
#endif
#else
if
(
dst_data_valid
)
if
(
dst_
thread_
data_valid
)
{
p_dst_wave
[
dst_thread_data_offset
+
dst_const_data_offset
]
=
*
p_src_thread
;
p_dst_wave
[
dst_thread_data_offset
]
=
*
p_src_thread
;
}
#endif
}
...
...
@@ -966,39 +876,36 @@ template <>
__device__
void
amd_buffer_store
<
ushort
,
2
>
(
const
ushort
*
p_src_thread
,
ushort
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
BufferResourceConstant
<
ushort
>
dst_wave_
config
;
BufferResourceConstant
<
ushort
>
dst_wave_
buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
*
sizeof
(
ushort
);
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
ushort
);
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
ushort
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
ushort
);
const
float
*
p_src_tmp
=
reinterpret_cast
<
const
float
*>
(
p_src_thread
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32
(
*
p_src_tmp
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
)
:
0xffffffff
,
dst_thread_data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32
(
*
p_src_tmp
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
,
dst_addr_base
+
dst_thread_addr_offset
,
false
,
false
);
#endif
...
...
@@ -1008,39 +915,36 @@ template <>
__device__
void
amd_buffer_store
<
ushort
,
4
>
(
const
ushort
*
p_src_thread
,
ushort
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
BufferResourceConstant
<
ushort
>
dst_wave_
config
;
BufferResourceConstant
<
ushort
>
dst_wave_
buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
*
sizeof
(
ushort
);
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
ushort
);
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
ushort
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
ushort
);
const
float2_t
*
p_src_tmp
=
reinterpret_cast
<
const
float2_t
*>
(
p_src_thread
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
dst_wave_config
.
data
,
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
dst_wave_buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
)
:
0xffffffff
,
dst_
thread_
data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
,
dst_addr_base
+
dst_thread_addr_offset
,
false
,
false
);
#endif
...
...
@@ -1050,39 +954,36 @@ template <>
__device__
void
amd_buffer_store
<
ushort
,
8
>
(
const
ushort
*
p_src_thread
,
ushort
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
BufferResourceConstant
<
ushort
>
dst_wave_
config
;
BufferResourceConstant
<
ushort
>
dst_wave_
buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
*
sizeof
(
ushort
);
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
ushort
);
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
ushort
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
ushort
);
const
float4_t
*
p_src_tmp
=
reinterpret_cast
<
const
float4_t
*>
(
p_src_thread
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_store_f32x4
(
*
p_src_tmp
,
dst_wave_config
.
data
,
__llvm_amdgcn_buffer_store_f32x4
(
*
p_src_tmp
,
dst_wave_buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
)
:
0xffffffff
,
dst_
thread_
data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
false
,
false
);
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x4
(
*
p_src_tmp
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
,
dst_addr_base
+
dst_thread_addr_offset
,
false
,
false
);
#endif
...
...
@@ -1092,37 +993,33 @@ template <>
__device__
void
amd_buffer_atomic_add
<
float
,
1
>
(
const
float
*
p_src_thread
,
float
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
BufferResourceConstant
<
float
>
dst_wave_
config
;
BufferResourceConstant
<
float
>
dst_wave_
buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
float
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
__llvm_amdgcn_buffer_atomic_add_f32
(
*
p_src_thread
,
dst_wave_config
.
data
,
__llvm_amdgcn_buffer_atomic_add_f32
(
*
p_src_thread
,
dst_wave_buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
)
:
0xffffffff
,
dst_
thread_
data_valid
?
dst_thread_addr_offset
:
0xffffffff
,
false
);
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_atomic_add_f32
(
*
p_src_thread
,
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
,
dst_addr_base
+
dst_thread_addr_offset
,
false
);
#endif
}
...
...
@@ -1131,43 +1028,40 @@ template <>
__device__
void
amd_buffer_atomic_add
<
float
,
2
>
(
const
float
*
p_src_thread
,
float
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
BufferResourceConstant
<
float
>
dst_wave_
config
;
BufferResourceConstant
<
float
>
dst_wave_
buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
;
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
;
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
float
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
for
(
index_t
i
=
0
;
i
<
2
;
++
i
)
{
__llvm_amdgcn_buffer_atomic_add_f32
(
p_src_thread
[
i
],
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
+
i
*
sizeof
(
float
))
:
0xffffffff
,
dst_thread_data_valid
?
(
dst_thread_addr_offset
+
i
*
sizeof
(
float
))
:
0xffffffff
,
false
);
}
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
for
(
index_t
i
=
0
;
i
<
2
;
++
i
)
{
__llvm_amdgcn_buffer_atomic_add_f32
(
p_src_thread
[
i
],
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
+
i
*
sizeof
(
float
),
i
*
sizeof
(
float
),
false
);
}
#endif
...
...
@@ -1177,43 +1071,40 @@ template <>
__device__
void
amd_buffer_atomic_add
<
float
,
4
>
(
const
float
*
p_src_thread
,
float
*
p_dst_wave
,
index_t
dst_thread_data_offset
,
index_t
dst_const_data_offset
,
bool
dst_data_valid
,
bool
dst_thread_data_valid
,
index_t
dst_data_range
)
{
BufferResourceConstant
<
float
>
dst_wave_
config
;
BufferResourceConstant
<
float
>
dst_wave_
buffer_resource
;
// wavewise base address (64 bit)
dst_wave_
config
.
address
[
0
]
=
p_dst_wave
;
dst_wave_
buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_
config
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
dst_wave_
buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
dst_wave_config
.
range
[
3
]
=
0x00027000
;
dst_wave_
buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
index_t
dst_const_addr_offset
=
dst_const_data_offset
*
sizeof
(
float
);
#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
for
(
index_t
i
=
0
;
i
<
4
;
++
i
)
{
__llvm_amdgcn_buffer_atomic_add_f32
(
p_src_thread
[
i
],
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_data_valid
?
(
dst_thread_addr_offset
+
dst_const_addr_offset
+
i
*
sizeof
(
float
))
:
0xffffffff
,
dst_thread_data_valid
?
(
dst_thread_addr_offset
+
i
*
sizeof
(
float
))
:
0xffffffff
,
false
);
}
#else
index_t
dst_addr_base
=
dst_data_valid
?
0
:
0x7fffffff
;
index_t
dst_addr_base
=
dst_
thread_
data_valid
?
0
:
0x7fffffff
;
for
(
index_t
i
=
0
;
i
<
4
;
++
i
)
{
__llvm_amdgcn_buffer_atomic_add_f32
(
p_src_thread
[
i
],
dst_wave_
config
.
data
,
dst_wave_
buffer_resource
.
data
,
0
,
dst_addr_base
+
dst_thread_addr_offset
+
dst_const_addr_offset
+
i
*
sizeof
(
float
),
i
*
sizeof
(
float
),
false
);
}
#endif
...
...
composable_kernel/include/utility/in_memory_operation.amd.hpp.in
View file @
435f5f91
...
...
@@ -72,8 +72,8 @@ struct SetData
#if CK_USE_AMD_BUFFER_ADDRESSING
// buffer_load requires:
// 1) p_src must be in global memory space,
d
_dst must be vgpr
// 2) p_src to be a
block-invariant
pointer.
// 1) p_src
_thread
must be in global memory space,
p
_dst
_thread
must be vgpr
// 2) p_src
_thread
to be a
wavewise
pointer.
// It is user's responsibility to make sure that is true.
template <>
__device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
...
...
@@ -88,13 +88,13 @@ struct SetData
if(dst_valid)
{
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
amd_buffer_load<T, DataPerAccess>(p_src, src_offset,
0,
src_valid, src_range);
amd_buffer_load<T, DataPerAccess>(p_src, src_offset, src_valid, src_range);
}
}
// buffer_store requires:
// 1) p_src must be in vgpr space,
d
_dst must be global memory
// 2) p_dst to be a
block-invariant
pointer.
// 1) p_src
_thread
must be in vgpr space,
p
_dst
_thread
must be global memory
// 2) p_dst
_thread
to be a
wavewise
pointer.
// It is user's responsibility to make sure that is true.
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
...
...
@@ -109,7 +109,7 @@ struct SetData
const auto zeros = vector_t(0);
amd_buffer_store<T, DataPerAccess>(
src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset,
0,
dst_valid, dst_range);
src_valid ? &(p_src[src_offset]) : &zeros, p_dst, dst_offset, dst_valid, dst_range);
}
#endif
};
...
...
@@ -138,9 +138,9 @@ struct AtomicAddData
}
#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
// buffer_atomic
_add
requires:
// 1) p_src must be in vgpr space,
d
_dst must be global memory
// 2) p_dst to be a
block-invariant
pointer.
// buffer_atomic requires:
// 1) p_src
_thread
must be in vgpr space,
p
_dst
_thread
must be global memory
// 2) p_dst
_thread
to be a
wavewise
pointer.
// It is user's responsibility to make sure that is true.
template <>
__device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
...
...
@@ -156,7 +156,6 @@ struct AtomicAddData
amd_buffer_atomic_add<T, DataPerAccess>(src_valid ? &(p_src[src_offset]) : &zeros,
p_dst,
dst_offset,
0,
dst_valid,
index_t dst_range);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment