Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
079b745d
Commit
079b745d
authored
Feb 20, 2021
by
Chao Liu
Browse files
refactor buffer load/store
parent
f6ec737c
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
317 additions
and
218 deletions
+317
-218
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
...or_operation/threadwise_dynamic_tensor_slice_transfer.hpp
+9
-40
composable_kernel/include/utility/amd_buffer_addressing.hpp
composable_kernel/include/utility/amd_buffer_addressing.hpp
+25
-172
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
...sable_kernel/include/utility/amd_buffer_addressing_v2.hpp
+276
-0
composable_kernel/include/utility/config.amd.hpp.in
composable_kernel/include/utility/config.amd.hpp.in
+6
-6
composable_kernel/include/utility/in_memory_operation.amd.hpp.in
...ble_kernel/include/utility/in_memory_operation.amd.hpp.in
+1
-0
No files found.
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
View file @
079b745d
...
...
@@ -178,7 +178,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
dst_vector
(
i
)
=
p_src
[
Number
<
src_offset
>
{}];
});
#if 1
amd_buffer_store_v2
<
DstData
,
DstScalarPerVector
>
(
dst_vector
.
Vector
(),
p_dst
,
...
...
@@ -186,17 +185,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
dst_desc
,
dst_slice_origin_coord_
),
dst_desc
.
GetElementSpaceSize
());
#else
static_for
<
0
,
DstScalarPerVector
,
1
>
{}([
&
](
auto
i
)
{
amd_buffer_store_v2
<
DstData
,
1
>
(
dst_vector
[
i
],
p_dst
,
dst_slice_origin_coord_
.
GetOffset
()
+
i
.
value
,
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
dst_desc
,
dst_slice_origin_coord_
),
dst_desc
.
GetElementSpaceSize
());
});
#endif
constexpr
auto
move_on_dim
=
[
&
]()
constexpr
{
...
...
@@ -480,16 +468,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2
if
constexpr
(
SrcAddressSpace
==
AddressSpace
::
Global
)
{
src_vector
.
Vector
()
=
amd_buffer_load
<
SrcData
,
SrcScalarPerVector
>
(
p_src
,
src_slice_origin_coord_
.
GetOffset
(),
true
,
src_desc
.
GetElementSpaceSize
());
const
bool
is_valid
=
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
src_desc
,
src_slice_origin_coord_
);
src_vector
.
Vector
()
=
is_valid
?
src_vector
.
Vector
()
:
src_vector_t
{
0
};
src_vector
.
Vector
()
=
amd_buffer_load_v2
<
SrcData
,
SrcScalarPerVector
>
(
p_src
,
src_slice_origin_coord_
.
GetOffset
(),
is_valid
,
src_desc
.
GetElementSpaceSize
());
}
else
{
...
...
@@ -816,30 +802,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
using
src_vector_t
=
typename
vector_type
<
SrcData
,
SrcScalarPerVector
>::
MemoryType
;
#if 1
src_vector
.
Vector
()
=
amd_buffer_load
<
SrcData
,
SrcScalarPerVector
>
(
p_src
,
src_slice_origin_coord_
.
GetOffset
(),
true
,
src_desc
.
GetElementSpaceSize
());
const
bool
is_valid
=
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
src_desc
,
src_slice_origin_coord_
);
src_vector
.
Vector
()
=
is_valid
?
src_vector
.
Vector
()
:
src_vector_t
{
0
};
static_for
<
0
,
SrcScalarPerVector
,
1
>
{}([
&
](
auto
i
)
{
constexpr
index_t
buffer_offset
=
buffer_desc_
.
CalculateOffset
(
src_data_idx
+
i
*
src_scalar_step_in_vector
);
buffer_
(
Number
<
buffer_offset
>
{})
=
src_vector
[
i
];
});
#else
const
bool
is_valid
=
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
src_desc
,
src_slice_origin_coord_
);
src_vector
.
Vector
()
=
amd_buffer_load
<
SrcData
,
SrcScalarPerVector
>
(
p_src
,
src_slice_origin_coord_
.
GetOffset
(),
is_valid
,
src_desc
.
GetElementSpaceSize
());
amd_buffer_load
_v2
<
SrcData
,
SrcScalarPerVector
>
(
p_src
,
src_slice_origin_coord_
.
GetOffset
(),
is_valid
,
src_desc
.
GetElementSpaceSize
());
static_for
<
0
,
SrcScalarPerVector
,
1
>
{}([
&
](
auto
i
)
{
constexpr
index_t
buffer_offset
=
...
...
@@ -847,7 +817,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
buffer_
(
Number
<
buffer_offset
>
{})
=
src_vector
[
i
];
});
#endif
constexpr
auto
move_on_dim
=
[
&
]()
constexpr
{
...
...
composable_kernel/include/utility/amd_buffer_addressing.hpp
View file @
079b745d
...
...
@@ -118,18 +118,6 @@ __device__ void amd_buffer_store(const T* p_src_thread,
bool
dst_thread_data_valid
,
index_t
dst_data_range
);
// buffer_store requires:
// 1) p_dst_wave must be global memory
// 2) p_dst_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template
<
typename
T
,
index_t
VectorSize
>
__device__
void
amd_buffer_store_v2
(
const
typename
vector_type
<
T
,
VectorSize
>::
MemoryType
src_thread_data
,
T
*
p_dst_wave
,
const
index_t
dst_thread_data_offset
,
const
bool
dst_thread_data_valid
,
const
index_t
dst_data_range
);
// buffer_atomic requires:
// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
// 2) p_dst_thread to be a wavewise pointer.
...
...
@@ -158,21 +146,16 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_shift
+
src_thread_addr_offset
,
false
,
false
);
#else
#if 0 // debug
float
tmp
=
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
return
src_thread_data_valid
?
tmp
:
float
(
0
);
#else
return
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
#endif
#endif
}
...
...
@@ -193,21 +176,16 @@ __device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_shift
+
src_thread_addr_offset
,
false
,
false
);
#else
#if 0
float2_t
tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
return
src_thread_data_valid
?
tmp
:
float2_t
(
0
);
#else
return
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
#endif
#endif
}
...
...
@@ -228,21 +206,16 @@ __device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_shift
+
src_thread_addr_offset
,
false
,
false
);
#else
#if 0
float4_t
tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
return
src_thread_data_valid
?
tmp
:
float4_t
(
0
);
#else
return
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
#endif
#endif
}
...
...
@@ -263,7 +236,7 @@ __device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_wave,
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
half_t
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
...
...
@@ -298,7 +271,7 @@ __device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_wave,
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
half_t
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
...
...
@@ -332,7 +305,7 @@ __device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_wave,
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
half_t
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
...
...
@@ -366,7 +339,7 @@ __device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_wave,
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
half_t
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
...
...
@@ -400,7 +373,7 @@ __device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_wave,
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
ushort
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
...
...
@@ -435,7 +408,7 @@ __device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_wave,
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
ushort
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
float
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32
(
...
...
@@ -469,7 +442,7 @@ __device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_wave,
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
ushort
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
float2_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
...
...
@@ -503,7 +476,7 @@ __device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_wave,
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
ushort
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
float4_t
dst_out_tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
...
...
@@ -538,7 +511,7 @@ __device__ void amd_buffer_store<float, 1>(const float* p_src_thread,
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32
(
*
p_src_thread
,
...
...
@@ -574,7 +547,7 @@ __device__ void amd_buffer_store<float, 2>(const float* p_src_thread,
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x2
(
*
reinterpret_cast
<
const
float2_t
*>
(
p_src_thread
),
...
...
@@ -614,7 +587,7 @@ __device__ void amd_buffer_store<float, 4>(const float* p_src_thread,
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x4
(
*
reinterpret_cast
<
const
float4_t
*>
(
p_src_thread
),
...
...
@@ -654,7 +627,7 @@ __device__ void amd_buffer_store<half_t, 1>(const half_t* p_src_thread,
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
half_t
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
// current code cannot isolate Soffset and Voffset, so Soffset is hard-coded to 0, and
...
...
@@ -695,7 +668,7 @@ __device__ void amd_buffer_store<half_t, 2>(const half_t* p_src_thread,
const
float
*
p_src_tmp
=
reinterpret_cast
<
const
float
*>
(
p_src_thread
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32
(
*
p_src_tmp
,
...
...
@@ -733,7 +706,7 @@ __device__ void amd_buffer_store<half_t, 4>(const half_t* p_src_thread,
const
float2_t
*
p_src_tmp
=
reinterpret_cast
<
const
float2_t
*>
(
p_src_thread
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
...
...
@@ -771,7 +744,7 @@ __device__ void amd_buffer_store<half_t, 8>(const half_t* p_src_thread,
const
float4_t
*
p_src_tmp
=
reinterpret_cast
<
const
float4_t
*>
(
p_src_thread
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x4
(
*
p_src_tmp
,
...
...
@@ -807,7 +780,7 @@ __device__ void amd_buffer_store<ushort, 1>(const ushort* p_src_thread,
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
ushort
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_raw_buffer_store_bf16
(
*
p_src_thread
,
...
...
@@ -844,7 +817,7 @@ __device__ void amd_buffer_store<ushort, 2>(const ushort* p_src_thread,
const
float
*
p_src_tmp
=
reinterpret_cast
<
const
float
*>
(
p_src_thread
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32
(
*
p_src_tmp
,
...
...
@@ -882,7 +855,7 @@ __device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
const
float2_t
*
p_src_tmp
=
reinterpret_cast
<
const
float2_t
*>
(
p_src_thread
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x2
(
*
p_src_tmp
,
...
...
@@ -920,7 +893,7 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
const
float4_t
*
p_src_tmp
=
reinterpret_cast
<
const
float4_t
*>
(
p_src_thread
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x4
(
*
p_src_tmp
,
...
...
@@ -938,126 +911,6 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
#endif
}
template
<
>
__device__
void
amd_buffer_store_v2
<
float
,
1
>
(
const
float
src_thread_data
,
float
*
p_dst_wave
,
const
index_t
dst_thread_data_offset
,
const
bool
dst_thread_data_valid
,
const
index_t
dst_data_range
)
{
BufferResourceConstant
<
float
>
dst_wave_buffer_resource
;
// wavewise base address (64 bit)
dst_wave_buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
dst_wave_buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_addr_shift
+
dst_thread_addr_offset
,
false
,
false
);
#else
if
(
dst_thread_data_valid
)
{
__llvm_amdgcn_buffer_store_f32
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_thread_addr_offset
,
false
,
false
);
}
#endif
}
template
<
>
__device__
void
amd_buffer_store_v2
<
float
,
2
>
(
const
float2_t
src_thread_data
,
float
*
p_dst_wave
,
const
index_t
dst_thread_data_offset
,
const
bool
dst_thread_data_valid
,
const
index_t
dst_data_range
)
{
BufferResourceConstant
<
float
>
dst_wave_buffer_resource
;
// wavewise base address (64 bit)
dst_wave_buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
dst_wave_buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x2
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_addr_shift
+
dst_thread_addr_offset
,
false
,
false
);
#else
if
(
dst_thread_data_valid
)
{
__llvm_amdgcn_buffer_store_f32x2
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_thread_addr_offset
,
false
,
false
);
}
#endif
}
template
<
>
__device__
void
amd_buffer_store_v2
<
float
,
4
>
(
const
float4_t
src_thread_data
,
float
*
p_dst_wave
,
const
index_t
dst_thread_data_offset
,
const
bool
dst_thread_data_valid
,
const
index_t
dst_data_range
)
{
BufferResourceConstant
<
float
>
dst_wave_buffer_resource
;
// wavewise base address (64 bit)
dst_wave_buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
dst_wave_buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x4
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_addr_shift
+
dst_thread_addr_offset
,
false
,
false
);
#else
if
(
dst_thread_data_valid
)
{
__llvm_amdgcn_buffer_store_f32x4
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_thread_addr_offset
,
false
,
false
);
}
#endif
}
#if CK_USE_AMD_BUFFER_ATOMIC_FADD
template
<
>
__device__
void
amd_buffer_atomic_add
<
float
,
1
>
(
const
float
*
p_src_thread
,
...
...
@@ -1077,7 +930,7 @@ __device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_atomic_add_f32
(
*
p_src_thread
,
...
...
@@ -1112,7 +965,7 @@ __device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
for
(
index_t
i
=
0
;
i
<
2
;
++
i
)
...
...
@@ -1157,7 +1010,7 @@ __device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
for
(
index_t
i
=
0
;
i
<
4
;
++
i
)
...
...
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
0 → 100644
View file @
079b745d
#ifndef CK_AMD_BUFFER_ADDRESSING_V2_HPP
#define CK_AMD_BUFFER_ADDRESSING_V2_HPP
#include "float_type.hpp"
#include "amd_buffer_addressing.hpp"
namespace
ck
{
#if 0
// For 128 bit SGPRs to supply resource constant in buffer instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
template <typename T>
union BufferResourceConstant
{
int32x4_t data;
T* address[2];
int32_t range[4];
int32_t config[4];
};
#endif
__device__
float
__llvm_amdgcn_buffer_load_f32
(
int32x4_t
srsrc
,
index_t
vindex
,
index_t
offset
,
bool
glc
,
bool
slc
)
__asm
(
"llvm.amdgcn.buffer.load.f32"
);
__device__
float2_t
__llvm_amdgcn_buffer_load_f32x2
(
int32x4_t
srsrc
,
index_t
vindex
,
index_t
offset
,
bool
glc
,
bool
slc
)
__asm
(
"llvm.amdgcn.buffer.load.v2f32"
);
__device__
float4_t
__llvm_amdgcn_buffer_load_f32x4
(
int32x4_t
srsrc
,
index_t
vindex
,
index_t
offset
,
bool
glc
,
bool
slc
)
__asm
(
"llvm.amdgcn.buffer.load.v4f32"
);
// buffer_load requires:
// 1) p_src_wave must be in global memory space
// 2) p_src_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template
<
typename
T
,
index_t
VectorSize
>
__device__
typename
vector_type
<
T
,
VectorSize
>::
MemoryType
amd_buffer_load_v2
(
const
T
*
p_src_wave
,
index_t
src_thread_data_offset
,
bool
src_thread_data_valid
,
index_t
src_elemenst_space
);
// buffer_store requires:
// 1) p_dst_wave must be global memory
// 2) p_dst_wave to be a wavewise pointer.
// It is user's responsibility to make sure that is true.
template
<
typename
T
,
index_t
VectorSize
>
__device__
void
amd_buffer_store_v2
(
const
typename
vector_type
<
T
,
VectorSize
>::
MemoryType
src_thread_data
,
T
*
p_dst_wave
,
const
index_t
dst_thread_data_offset
,
const
bool
dst_thread_data_valid
,
const
index_t
dst_data_range
);
template
<
>
__device__
float
amd_buffer_load_v2
<
float
,
1
>
(
const
float
*
p_src_wave
,
index_t
src_thread_data_offset
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
float
>
src_wave_buffer_resource
;
// wavewise base address (64 bit)
src_wave_buffer_resource
.
address
[
0
]
=
const_cast
<
float
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
src_wave_buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_shift
+
src_thread_addr_offset
,
false
,
false
);
#else
float
tmp
=
__llvm_amdgcn_buffer_load_f32
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
return
src_thread_data_valid
?
tmp
:
float
(
0
);
#endif
}
template
<
>
__device__
float2_t
amd_buffer_load_v2
<
float
,
2
>
(
const
float
*
p_src_wave
,
index_t
src_thread_data_offset
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
float
>
src_wave_buffer_resource
;
// wavewise base address (64 bit)
src_wave_buffer_resource
.
address
[
0
]
=
const_cast
<
float
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
src_wave_buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_shift
+
src_thread_addr_offset
,
false
,
false
);
#else
float2_t
tmp
=
__llvm_amdgcn_buffer_load_f32x2
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
return
src_thread_data_valid
?
tmp
:
float2_t
(
0
);
#endif
}
template
<
>
__device__
float4_t
amd_buffer_load_v2
<
float
,
4
>
(
const
float
*
p_src_wave
,
index_t
src_thread_data_offset
,
bool
src_thread_data_valid
,
index_t
src_data_range
)
{
BufferResourceConstant
<
float
>
src_wave_buffer_resource
;
// wavewise base address (64 bit)
src_wave_buffer_resource
.
address
[
0
]
=
const_cast
<
float
*>
(
p_src_wave
);
// wavewise range (32 bit)
src_wave_buffer_resource
.
range
[
2
]
=
src_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
src_wave_buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
src_thread_addr_offset
=
src_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t
src_addr_shift
=
src_thread_data_valid
?
0
:
0x7fffffff
;
return
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_addr_shift
+
src_thread_addr_offset
,
false
,
false
);
#else
float4_t
tmp
=
__llvm_amdgcn_buffer_load_f32x4
(
src_wave_buffer_resource
.
data
,
0
,
src_thread_addr_offset
,
false
,
false
);
return
src_thread_data_valid
?
tmp
:
float4_t
(
0
);
#endif
}
template
<
>
__device__
void
amd_buffer_store_v2
<
float
,
1
>
(
const
float
src_thread_data
,
float
*
p_dst_wave
,
const
index_t
dst_thread_data_offset
,
const
bool
dst_thread_data_valid
,
const
index_t
dst_data_range
)
{
BufferResourceConstant
<
float
>
dst_wave_buffer_resource
;
// wavewise base address (64 bit)
dst_wave_buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
dst_wave_buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_addr_shift
+
dst_thread_addr_offset
,
false
,
false
);
#else
if
(
dst_thread_data_valid
)
{
__llvm_amdgcn_buffer_store_f32
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_thread_addr_offset
,
false
,
false
);
}
#endif
}
template
<
>
__device__
void
amd_buffer_store_v2
<
float
,
2
>
(
const
float2_t
src_thread_data
,
float
*
p_dst_wave
,
const
index_t
dst_thread_data_offset
,
const
bool
dst_thread_data_valid
,
const
index_t
dst_data_range
)
{
BufferResourceConstant
<
float
>
dst_wave_buffer_resource
;
// wavewise base address (64 bit)
dst_wave_buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
dst_wave_buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x2
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_addr_shift
+
dst_thread_addr_offset
,
false
,
false
);
#else
if
(
dst_thread_data_valid
)
{
__llvm_amdgcn_buffer_store_f32x2
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_thread_addr_offset
,
false
,
false
);
}
#endif
}
template
<
>
__device__
void
amd_buffer_store_v2
<
float
,
4
>
(
const
float4_t
src_thread_data
,
float
*
p_dst_wave
,
const
index_t
dst_thread_data_offset
,
const
bool
dst_thread_data_valid
,
const
index_t
dst_data_range
)
{
BufferResourceConstant
<
float
>
dst_wave_buffer_resource
;
// wavewise base address (64 bit)
dst_wave_buffer_resource
.
address
[
0
]
=
p_dst_wave
;
// wavewise range (32 bit)
dst_wave_buffer_resource
.
range
[
2
]
=
dst_data_range
*
sizeof
(
float
);
// wavewise setting (32 bit)
dst_wave_buffer_resource
.
config
[
3
]
=
0x00027000
;
index_t
dst_thread_addr_offset
=
dst_thread_data_offset
*
sizeof
(
float
);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
uint32_t
dst_addr_shift
=
dst_thread_data_valid
?
0
:
0x7fffffff
;
__llvm_amdgcn_buffer_store_f32x4
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_addr_shift
+
dst_thread_addr_offset
,
false
,
false
);
#else
if
(
dst_thread_data_valid
)
{
__llvm_amdgcn_buffer_store_f32x4
(
src_thread_data
,
dst_wave_buffer_resource
.
data
,
0
,
dst_thread_addr_offset
,
false
,
false
);
}
#endif
}
}
// namespace ck
#endif
composable_kernel/include/utility/config.amd.hpp.in
View file @
079b745d
...
...
@@ -58,16 +58,16 @@
#endif
// experimental implementation
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK 0
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK
_OFFSET_TRICK
0
#endif
#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK 1
#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
_OFFSET_TRICK
1
#endif
#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK 1
#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_OOB_CHECK
_OFFSET_TRICK
1
#endif
#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
...
...
composable_kernel/include/utility/in_memory_operation.amd.hpp.in
View file @
079b745d
...
...
@@ -5,6 +5,7 @@
#if CK_USE_AMD_BUFFER_ADDRESSING
#include "amd_buffer_addressing.hpp"
#include "amd_buffer_addressing_v2.hpp"
#endif
namespace ck {
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment