gaoqiong / composable_kernel · Commits

Commit 3b3b9623, authored Nov 18, 2019 by Chao Liu
Parent: 8b45553d

    clean up

Showing 3 changed files with 17 additions and 91 deletions:

    composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp   +0  -90
    composable_kernel/include/utility/amd_buffer_addressing.hpp                           +8  -0
    composable_kernel/include/utility/in_memory_operation.amd.hpp.in                      +9  -1
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

...
@@ -117,29 +117,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 // has the same padding situation
                 if(src_coord.IsUpperIndexMappedToValidOffset())
                 {
-#if 0 // debug
-                    static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
-#if CK_USE_AMD_BUFFER_ADDRESSING
-                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
-                            amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(
-                                fwd(p_src), src_coord.GetOffset(), 0);
-#else
-                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
-                            *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
-#endif
-                    }).Else([&](auto) {
-                        // src can be all kinds of memory-space.
-                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
-                            *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
-                    });
-#else
                     move_data<SrcData,
                               SrcDataPerAccess,
                               SrcAddressSpace,
                               AddressSpace::vgpr,
                               InMemoryDataOperation::none>(
                         p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
-#endif
                 }
             }
...
@@ -166,31 +149,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 // has the same padding situation
                 if(dst_coord.IsUpperIndexMappedToValidOffset())
                 {
-#if 0 // debug
-                    static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
-#if CK_USE_AMD_BUFFER_ADDRESSING
-                        amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
-                            fwd(p_dst),
-                            dst_coord.GetOffset(),
-                            0);
-#else
-                        *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
-#endif
-                    }).Else([&](auto) {
-                        // dst can be all kinds of memory-space
-                        *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
-                    });
-#else
                     move_data<DstData,
                               DstDataPerAccess,
                               AddressSpace::vgpr,
                               DstAddressSpace,
                               DstInMemOp>(
                         p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
-#endif
                 }
             }
         });
...
@@ -204,9 +168,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
         return Sequence<(Mask ? Lengths : 1)...>{};
     }

-    // p_src must be global-memory, p_dst can be any memory-space.
-    // User should make sure p_src is a block-invariant pointer, because
-    // buffer_load is used for loading from global-memory into register buffer.
     // Will do padding check on src data: Read 0 if src data is in padding area.
     // Will do padding check on dst data: No write if dst data is in paddin area.
     // This version is optimized for address calculation of src tensor
...
@@ -308,23 +269,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     // the src vector has the same padding situation
                     if(src_coord.IsUpperIndexMappedToValidOffset())
                     {
-#if 0 // debug
-                        static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto) {
-#if CK_USE_AMD_BUFFER_ADDRESSING
-                            *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
-                                amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(
-                                    p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
-#else
-                            *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
-                                *reinterpret_cast<const src_vector_t*>(
-                                    &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
-#endif
-                        }).Else([&](auto) {
-                            *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
-                                *reinterpret_cast<const src_vector_t*>(
-                                    &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
-                        });
-#else
                         move_data<SrcData,
                                   SrcDataPerAccess,
                                   SrcAddressSpace,
...
@@ -334,7 +278,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                                   src_linear_offset,
                                   p_src_long_vector,
                                   buffer_offset);
-#endif
                     }
                 }
...
@@ -364,26 +307,18 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     // the dst vector has the same padding situation
                     if(dst_coord.IsUpperIndexMappedToValidOffset())
                     {
-#if 0 // debug
-                        *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
-#else
                         move_data<DstData,
                                   DstDataPerAccess,
                                   AddressSpace::vgpr,
                                   DstAddressSpace,
                                   DstInMemOp>(
                             p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
-#endif
                     }
                 }
             });
         });
     }

-    // p_src could be any memory space, d_dst must be global memory.
-    // User should make sure p_dst is a block-invariant pointer, because
-    // buffer_load is used for storing data from regsiter buffer into global-memory.
     // Will do padding check on src data: Read 0 if src data is in padding area.
     // Will do padding check on dst data: No write if dst data is in paddin area.
     // This version is optimized for address calculation of dst tensor
...
@@ -476,17 +411,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     // the src vector has the same padding situation
                     if(src_coord.IsUpperIndexMappedToValidOffset())
                     {
-#if 0
-                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
-                            *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
-#else
                         move_data<SrcData,
                                   SrcDataPerAccess,
                                   SrcAddressSpace,
                                   AddressSpace::vgpr,
                                   InMemoryDataOperation::none>(
                             p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
-#endif
                     }
                 }
...
@@ -525,25 +455,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     // the dst vector has the same padding situation
                     if(dst_coord.IsUpperIndexMappedToValidOffset())
                     {
-#if 0
-                        static_if<DstAddressSpace == AddressSpace::global>{}([&](auto) {
-#if CK_USE_AMD_BUFFER_ADDRESSING
-                            amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
-                                *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
-                                p_dst,
-                                dst_nonlinear_coord.GetOffset(),
-                                dst_linear_offset);
-#else
-                            *reinterpret_cast<dst_vector_t*>(
-                                &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
-                                *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
-#endif
-                        }).Else([&](auto) {
-                            *reinterpret_cast<dst_vector_t*>(
-                                &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
-                                *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
-                        });
-#else
                         move_data<DstData,
                                   DstDataPerAccess,
                                   AddressSpace::vgpr,
...
@@ -552,7 +463,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                                   buffer_offset,
                                   p_dst,
                                   dst_nonlinear_coord.GetOffset() + dst_linear_offset);
-#endif
                     }
                 }
             });
...
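The deletions above drop the "#if 0 // debug" fallback paths and keep only the move_data calls. For readers unfamiliar with that helper, below is a minimal sketch of a generic-memory fallback with the same call shape as those calls, i.e. template parameters <data type, data per access, src address space, dst address space, in-memory operation> and arguments (p_src, src_offset, p_dst, dst_offset). The enums, index_t and vector_type here are stand-ins for the CK definitions of the same names; this is NOT the library's implementation, whose real dispatch to the buffer intrinsics is shown in the two files below.

    #include <cstdint>

    using index_t = std::int32_t;

    // stand-ins for the CK enums referenced in the diff
    enum class AddressSpace { generic, vgpr, lds, global };
    enum class InMemoryDataOperation { none, atomic_add };

    template <typename T, index_t N>
    struct vector_type
    {
        struct MemoryType { T e[N]; }; // stand-in for the HW vector alias
    };

    template <typename T,
              index_t DataPerAccess,
              AddressSpace SrcAddressSpace,
              AddressSpace DstAddressSpace,
              InMemoryDataOperation MemOp>
    __device__ void move_data_sketch(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
    {
        using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

        // generic path: one vectorized copy; the real move_data additionally selects
        // buffer_load / buffer_store when one side is global memory and
        // CK_USE_AMD_BUFFER_ADDRESSING is enabled
        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
            *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
    }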
composable_kernel/include/utility/amd_buffer_addressing.hpp

...
@@ -54,10 +54,18 @@ __device__ void __llvm_amdgcn_buffer_storex4(float4_t vdata,
                                              bool glc,
                                              bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");

+// buffer_load requires:
+//   1) p_src must be in global memory space, d_dst must be vgpr
+//   2) p_src to be a block-invariant pointer.
+// It is user's responsibility to make sure that is true.
 template <typename T, index_t VectorSize>
 __device__ typename vector_type<T, VectorSize>::MemoryType amd_intrinsic_buffer_load(
     const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset);

+// buffer_store requires:
+//   1) p_src must be in vgpr space, d_dst must be global memory
+//   2) p_dst to be a block-invariant pointer.
+// It is user's responsibility to make sure that is true.
 template <typename T, index_t VectorSize>
 __device__ void amd_intrinsic_buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
...
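The added comments document the preconditions of these two intrinsics. Below is a usage sketch inferred from the declarations above and from the call sites removed in the previous file; the example function name and the literal 0 constant offsets are illustrative, not part of this header, and both pointers are assumed to be block-invariant global-memory pointers as the comments require.

    __device__ void buffer_copy_example(const float* p_src_block,
                                        float* p_dst_block,
                                        index_t thread_data_offset)
    {
        // load a float4's worth of data from global memory into vgpr via buffer_load
        auto v = amd_intrinsic_buffer_load<float, 4>(p_src_block, thread_data_offset, 0);

        // write it back to global memory via buffer_store
        amd_intrinsic_buffer_store<float, 4>(v, p_dst_block, thread_data_offset, 0);
    }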
composable_kernel/include/utility/in_memory_operation.amd.hpp.in

...
@@ -15,12 +15,20 @@ __device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t
     using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

 #if CK_USE_AMD_BUFFER_ADDRESSING
-    // TODO: use static_if::ElseIf
+    // TODO: use static_if::ElseIf, instead of nested static_if
     static_if<SrcAddressSpace == AddressSpace::global && DstAddressSpace == vgpr>{}([&](auto) {
+        // buffer_load requires:
+        //   1) p_src must be in global memory space, d_dst must be vgpr
+        //   2) p_src to be a block-invariant pointer.
+        // It is user's responsibility to make sure that is true.
         *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
             amd_intrinsic_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
     }).Else([&](auto) {
         static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == global>{}([&](auto) {
+            // buffer_store requires:
+            //   1) p_src must be in vgpr space, d_dst must be global memory
+            //   2) p_dst to be a block-invariant pointer.
+            // It is user's responsibility to make sure that is true.
             amd_intrinsic_buffer_store<T, DataPerAccess>(
                 *reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
         }).Else([&](auto) {
...
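The TODO above notes that the nested static_if could be flattened. As an illustration of the dispatch logic only, the same three cases can be written with C++17 if constexpr; the CK source keeps static_if, and the function name below is a stand-in, assuming the amd_intrinsic_buffer_load/store declarations and the CK AddressSpace, index_t and vector_type definitions are in scope.

    template <typename T,
              index_t DataPerAccess,
              AddressSpace SrcAddressSpace,
              AddressSpace DstAddressSpace>
    __device__ void copy_data_sketch(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
    {
        using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

        if constexpr(SrcAddressSpace == AddressSpace::global &&
                     DstAddressSpace == AddressSpace::vgpr)
        {
            // global -> register: buffer_load (p_src must be block-invariant)
            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
                amd_intrinsic_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
        }
        else if constexpr(SrcAddressSpace == AddressSpace::vgpr &&
                          DstAddressSpace == AddressSpace::global)
        {
            // register -> global: buffer_store (p_dst must be block-invariant)
            amd_intrinsic_buffer_store<T, DataPerAccess>(
                *reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
        }
        else
        {
            // any other pair of address spaces: plain vectorized copy
            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
                *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
        }
    }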