yangql / composable_kernel-1 · Commits

Commit b3d4595f
Authored Sep 25, 2019 by Chao Liu
Parent: 3cb2a7d0

    added type conversion in threadwise and blockwise copy
Showing 4 changed files with 129 additions and 77 deletions (+129 -77):

    composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp  (+29 -22)
    composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  (+91 -47)
    composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  (+6 -5)
    composable_kernel/include/utility/config_amd.hpp.in  (+3 -3)
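The theme of the commit is that the copy helpers now carry both a source and a destination data type, so values can be converted element by element while they are staged; the type_convert functor added in config_amd.hpp.in does the actual cast. Below is a minimal host-side sketch of that pattern with illustrative types and a hypothetical copy_convert helper; the in-tree version is __device__ code inside the copy classes shown further down, not this free function.

    #include <cstdio>

    // Host-side sketch of the functor pattern this commit introduces
    // (the real type_convert in config_amd.hpp.in is marked __device__).
    template <typename T>
    struct type_convert
    {
        template <typename X>
        T operator()(const X& x) const
        {
            return static_cast<T>(x);
        }
    };

    // Element-wise conversion while copying, in the spirit of
    //   p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
    // copy_convert is a hypothetical helper, not part of composable_kernel.
    template <typename SrcData, typename DstData, int N>
    void copy_convert(const SrcData (&src)[N], DstData (&dst)[N])
    {
        for(int i = 0; i < N; ++i)
        {
            dst[i] = type_convert<DstData>{}(src[i]);
        }
    }

    int main()
    {
        const float src[4] = {1.25f, 2.5f, 3.75f, 5.0f};
        double dst[4];
        copy_convert(src, dst);      // float -> double, converted element by element
        std::printf("%f\n", dst[3]); // prints 5.000000
        return 0;
    }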
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp (view file @ b3d4595f)
@@ -287,9 +287,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
         // LDS double buffer: preload data into LDS
         {
-            blockwise_in_copy.template Run<Float, address_space_t::global, address_space_t::lds>(
-                p_in_global, p_in_block_double);
+            blockwise_in_copy.template Run<Float, Float, address_space_t::global>(
+                p_in_global, p_in_block_double);

-            blockwise_wei_copy.template Run<Float, address_space_t::global, address_space_t::lds>(
-                p_wei_global, p_wei_block_double);
+            blockwise_wei_copy.template Run<Float, Float, address_space_t::global>(
+                p_wei_global, p_wei_block_double);
         }
@@ -312,8 +312,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
             Float* p_wei_block_next =
                 even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;

-            Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-            Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+            Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+            Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

             blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
             blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
@@ -321,25 +321,27 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
             __syncthreads();

             // LDS doubel buffer: load next data from device mem
-            blockwise_in_copy.template RunLoadRegisterBuffer<Float, address_space_t::global>(
-                p_in_global, p_in_register_buffer);
-            blockwise_wei_copy.template RunLoadRegisterBuffer<Float, address_space_t::global>(
-                p_wei_global, p_wei_register_buffer);
+            blockwise_in_copy
+                .template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+                    p_in_global, p_in_thread_buffer);
+            blockwise_wei_copy
+                .template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+                    p_wei_global, p_wei_thread_buffer);

             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

             // LDS double buffer: store next data to LDS
-            blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
-            blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
+            blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_next);
+            blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next);
         }
     }

     // LDS double buffer: tail
     {
         // even iteration
-        Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-        Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+        Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+        Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

         blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
         blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
@@ -347,19 +349,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
         __syncthreads();

         // LDS doubel buffer: load next data from device mem
-        blockwise_in_copy.template RunLoadRegisterBuffer<Float, address_space_t::global>(
-            p_in_global, p_in_register_buffer);
-        blockwise_wei_copy.template RunLoadRegisterBuffer<Float, address_space_t::global>(
-            p_wei_global, p_wei_register_buffer);
+        blockwise_in_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+            p_in_global, p_in_thread_buffer);
+        blockwise_wei_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+            p_wei_global, p_wei_thread_buffer);

         // LDS double buffer: GEMM on current data
         blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

         // LDS double buffer: store next data to LDS
-        blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer,
-                                                 p_in_block_double + in_block_space);
-        blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer,
-                                                  p_wei_block_double + wei_block_space);
+        blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer,
+                                               p_in_block_double + in_block_space);
+        blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer,
+                                                p_wei_block_double + wei_block_space);

         // odd iteration
         __syncthreads();
@@ -431,9 +433,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                            b_thread_data_on_global,
                            0})
 #if 1
-            .template Run_generic<Float, Float, address_space_t::generic, address_space_t::global>
+            .template Run_generic<Float,
+                                  Float,
+                                  address_space_t::generic,
+                                  address_space_t::global>
 #elif 1
-            .template Run_optimized_dst_address_calculation<Float, Float, address_space_t::global>
+            .template Run_optimized_dst_address_calculation<Float,
+                                                            Float,
+                                                            address_space_t::global>
 #endif
             (p_out_thread, p_out_global);
     }
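For orientation, the calls in this file implement a double-buffered pipeline: while the blockwise GEMM consumes the tile currently in LDS, the next tile is already being fetched from global memory into the other half of the buffer, and a tail iteration drains the last tile. The following is an illustrative, heavily simplified host-side sketch of that schedule; load_tile, compute_tile and the sizes are hypothetical, and it omits the per-thread staging buffers and __syncthreads() barriers of the real kernel.

    #include <cstdio>
    #include <vector>

    // Illustrative-only sketch of the LDS double-buffering schedule:
    // while tile i is being consumed, tile i+1 is already being loaded
    // into the other half of the buffer. Names here are not CK's API.
    int main()
    {
        constexpr int num_tiles = 4;
        constexpr int tile_size = 8;
        std::vector<float> global_in(num_tiles * tile_size, 1.0f);
        float lds_double[2][tile_size]; // the two halves of the "LDS" buffer
        float acc = 0.0f;

        auto load_tile = [&](int tile, float* dst) { // stands for RunLoadThreadBuffer + RunStoreThreadBuffer
            for(int j = 0; j < tile_size; ++j)
                dst[j] = global_in[tile * tile_size + j];
        };
        auto compute_tile = [&](const float* src) { // stands for blockwise_gemm.Run
            for(int j = 0; j < tile_size; ++j)
                acc += src[j];
        };

        load_tile(0, lds_double[0]); // preload the first tile into buffer 0
        for(int tile = 0; tile + 1 < num_tiles; ++tile)
        {
            const int cur = tile % 2, nxt = 1 - cur;
            load_tile(tile + 1, lds_double[nxt]); // fetch the next tile
            compute_tile(lds_double[cur]);        // compute on the current tile
        }
        compute_tile(lds_double[(num_tiles - 1) % 2]); // tail iteration

        std::printf("acc = %f\n", acc); // 32.000000
        return 0;
    }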
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp (view file @ b3d4595f)
@@ -678,10 +678,10 @@ struct BlockwiseGenericTensorSliceCopy_v3
 };

 template <index_t BlockSize,
-          typename SrcDesc,
-          typename DstDesc,
-          typename SliceLengths,
-          typename SubLengths,
+          typename BlockSrcDesc,
+          typename BlockDstDesc,
+          typename BlockSliceLengths,
+          typename ThreadSliceLengths,
           typename ThreadClusterLengths,
           typename ThreadClusterArrangeOrder,
           typename SrcDimAccessOrder,
@@ -692,24 +692,49 @@ template <index_t BlockSize,
           index_t DstDataPerAccess>
 struct BlockwiseGenericTensorSliceCopy_v4
 {
-    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
+    static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension();

     using Index = MultiIndex<nDim>;

     __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(const Index& src_block_slice_origin,
                                                             const Index& dst_block_slice_origin)
     {
-        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
-                          nDim == DstDesc::GetNumOfDimension() &&
-                          nDim == SliceLengths::Size() &&
-                          nDim == SubLengths::Size() &&
+        static_assert(nDim == BlockSrcDesc::GetNumOfDimension() &&
+                          nDim == BlockDstDesc::GetNumOfDimension() &&
+                          nDim == BlockSliceLengths::Size() &&
+                          nDim == ThreadSliceLengths::Size() &&
                           nDim == ThreadClusterLengths::Size() &&
                           nDim == ThreadClusterArrangeOrder::Size() &&
                           nDim == SrcDimAccessOrder::Size() &&
                           nDim == DstDimAccessOrder::Size(),
                       "wrong! nDim not consistent");

-        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
-                      "wrong! threads should be mapped to cover entire slicing window");
+        static_assert(
+            is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
+            "wrong! threads should be mapped to cover entire slicing window");

+#if 1
         constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
             ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+#else
+        constexpr auto thread_cluster_lengths_in_arrange_order =
+            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{});
+
+        constexpr auto thread_cluster_desc = transform_tensor_descriptor(
+            make_native_tensor_descriptor_packed(thread_cluster_lengths_in_arrange_order),
+            make_tuple(Merge<decltype(thread_cluster_lengths_in_arrange_order)>{}),
+            make_tuple(arithmetic)::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+
+        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
+                      "wrong! BlockSize not consistent with ThreadClusterLengths");
+
+        constexpr auto thread_cluster_id = transform_tensor_descriptor(
+            make_native_tensor_descriptor_packed(Sequence<KBlockWork, BBlockWork>{}),
+            make_tuple(Merge<Sequence<KBlockWork, BBlockWork>>{}),
+            make_tuple(Sequence<0, 1>{}),
+            make_tuple(Sequence<0>{}));
+
+        const auto block_work_multi_id = block_work_desc.CalculateLowerIndex(get_block_1d_id());
+#endif
+
         static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                       "wrong! BlockSize not consistent with ThreadClusterLengths");
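The second static_assert above encodes the tiling contract behind the renamed parameters: each thread owns a ThreadSliceLengths sub-tile, threads are arranged as ThreadClusterLengths, and the element-wise product of the two must equal BlockSliceLengths. A small compile-time sketch of that check with made-up numbers, using plain C++ arrays instead of the Sequence<> types used in the header:

    #include <cstddef>

    // Illustrative numbers only: a 2D block slice of 8 x 64 elements covered
    // by a 2 x 64 thread cluster, each thread copying a 4 x 1 sub-tile.
    constexpr std::size_t ThreadSliceLengths[2]   = {4, 1};  // per-thread tile
    constexpr std::size_t ThreadClusterLengths[2] = {2, 64}; // threads per dimension
    constexpr std::size_t BlockSliceLengths[2]    = {8, 64}; // whole block tile

    static_assert(ThreadSliceLengths[0] * ThreadClusterLengths[0] == BlockSliceLengths[0] &&
                      ThreadSliceLengths[1] * ThreadClusterLengths[1] == BlockSliceLengths[1],
                  "threads must cover the entire slicing window");

    int main() { return 0; }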
@@ -720,7 +745,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
         const auto data_cluster_id =
             reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});

-        const auto thread_data_id_begin = data_cluster_id * SubLengths{};
+        const auto thread_data_id_begin = data_cluster_id * ThreadSliceLengths{};

         mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
         mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
@@ -729,51 +754,70 @@ struct BlockwiseGenericTensorSliceCopy_v4
         mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
     }

-    __device__ static constexpr index_t GetRegisterBufferSize()
+    __device__ static constexpr index_t GetThreadBufferSize()
     {
-        return RegisterBufferDesc::GetElementSpace();
+        return ThreadBufferDesc::GetElementSpace();
     }

-    template <typename SrcData,
-              typename BufferData,
-              address_space_t SrcAddressSpace = address_space_t::generic>
-    __device__ void RunLoadRegisterBuffer(const SrcData* p_src, BufferData* p_buffer) const
+    template <typename BlockSrcData,
+              typename ThreadBufferData,
+              address_space_t BlockSrcAddressSpace = address_space_t::generic,
+              address_space_t ThreadBufferAddressSpace = address_space_t::generic>
+    __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
+                                        ThreadBufferData* p_thread_buffer) const
     {
 #if 1
-        mThreadwiseLoad.template Run_generic<SrcData, BufferData, SrcAddressSpace, address_space_t::generic>(
-            p_src, p_buffer);
+        mThreadwiseLoad.template Run_generic<BlockSrcData,
+                                             ThreadBufferData,
+                                             BlockSrcAddressSpace,
+                                             ThreadBufferAddressSpace>(p_block_src, p_thread_buffer);
 #else
-        mThreadwiseLoad.template Run_optimized_src_address_calculation<SrcData,
-                                                                       BufferData,
-                                                                       SrcAddressSpace,
-                                                                       address_space_t::generic>(
-            p_src, p_buffer);
+        mThreadwiseLoad.template Run_optimized_src_address_calculation<BlockSrcData,
+                                                                       ThreadBufferData,
+                                                                       BlockSrcAddressSpace,
+                                                                       ThreadBufferAddressSpace>(
+            p_block_src, p_thread_buffer);
 #endif
     }

-    template <typename BufferData,
-              typename DstData,
-              address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void RunStoreRegisterBuffer(const BufferData* p_buffer, DstData* p_dst) const
+    template <typename ThreadBufferData,
+              typename BlockDstData,
+              address_space_t ThreadBufferAddressSpace = address_space_t::generic,
+              address_space_t BlockDstAddressSpace = address_space_t::generic>
+    __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
+                                         BlockDstData* p_block_dst) const
     {
 #if 1
-        mThreadwiseStore.template Run_generic<BufferData, DstData, address_space_t::generic, DstAddressSpace>(
-            p_buffer, p_dst);
+        mThreadwiseStore.template Run_generic<ThreadBufferData,
+                                              BlockDstData,
+                                              ThreadBufferAddressSpace,
+                                              BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
 #else
-        mThreadwiseStore.template Run_optimized_dst_address_calculation<BufferData,
-                                                                        DstData,
-                                                                        address_space_t::generic,
-                                                                        DstAddressSpace>(p_buffer,
-                                                                                         p_dst);
+        mThreadwiseStore.template Run_optimized_dst_address_calculation<ThreadBufferData,
+                                                                        BlockDstData,
+                                                                        ThreadBufferAddressSpace,
+                                                                        BlockDstAddressSpace>(
+            p_thread_buffer, p_block_dst);
 #endif
     }

-    template <typename SrcData,
-              typename DstData,
-              address_space_t SrcAddressSpace = address_space_t::generic,
-              address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
+    template <typename BlockSrcData,
+              typename BlockDstData,
+              address_space_t BlockSrcAddressSpace = address_space_t::generic,
+              address_space_t BlockDstAddressSpace = address_space_t::generic>
+    __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
     {
-        SrcData p_src_buffer[GetRegisterBufferSize()];
+        BlockSrcData p_thread_buffer[GetThreadBufferSize()];

-        RunLoadRegisterBuffer<SrcData, SrcData, SrcAddressSpace>(p_src, p_buffer);
-        RunStoreRegisterBuffer<SrcData, DstData, DstAddressSpace>(p_buffer, p_dst);
+        RunLoadThreadBuffer<BlockSrcData,
+                            BlockSrcData,
+                            BlockSrcAddressSpace,
+                            address_space_t::generic>(p_block_src, p_thread_buffer);
+
+        RunStoreThreadBuffer<BlockSrcData,
+                             BlockDstData,
+                             address_space_t::generic,
+                             BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
     }

     template <typename T, bool PositiveDirection>
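Run above is a two-phase copy: it stages a block-source tile into a small per-thread buffer and then stores that buffer to the block destination, and because the load and the store each take their own data types, a conversion can happen on either leg. A host-side sketch of the same shape, using hypothetical free functions in place of the class's member functions and static_cast in place of type_convert:

    #include <cstdio>

    // Illustrative two-phase copy mirroring BlockwiseGenericTensorSliceCopy_v4::Run:
    // stage into a per-thread buffer, then store to the destination, converting
    // where the types differ. Plain host C++, not the kernel's real API.
    template <typename BlockSrcData, typename ThreadBufferData, int N>
    void run_load_thread_buffer(const BlockSrcData* p_block_src, ThreadBufferData* p_thread_buffer)
    {
        for(int i = 0; i < N; ++i)
            p_thread_buffer[i] = static_cast<ThreadBufferData>(p_block_src[i]);
    }

    template <typename ThreadBufferData, typename BlockDstData, int N>
    void run_store_thread_buffer(const ThreadBufferData* p_thread_buffer, BlockDstData* p_block_dst)
    {
        for(int i = 0; i < N; ++i)
            p_block_dst[i] = static_cast<BlockDstData>(p_thread_buffer[i]);
    }

    int main()
    {
        constexpr int N = 4;
        const double block_src[N] = {0.5, 1.5, 2.5, 3.5};
        float block_dst[N];

        double thread_buffer[N]; // the "p_thread_buffer" staging array
        run_load_thread_buffer<double, double, N>(block_src, thread_buffer);
        run_store_thread_buffer<double, float, N>(thread_buffer, block_dst); // converts on the store leg

        std::printf("%f\n", block_dst[2]); // 2.500000
        return 0;
    }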
@@ -793,19 +837,19 @@ struct BlockwiseGenericTensorSliceCopy_v4
     }

     private:
-    using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{}));
+    using ThreadBufferDesc = decltype(make_native_tensor_descriptor_packed(ThreadSliceLengths{}));

-    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<SrcDesc,
-                                                                 RegisterBufferDesc,
-                                                                 SubLengths,
+    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2<BlockSrcDesc,
+                                                                 ThreadBufferDesc,
+                                                                 ThreadSliceLengths,
                                                                  SrcDimAccessOrder,
                                                                  SrcVectorAccessDim,
                                                                  SrcDataPerAccess,
                                                                  1>;

-    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<RegisterBufferDesc,
-                                                                  DstDesc,
-                                                                  SubLengths,
+    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc,
+                                                                  BlockDstDesc,
+                                                                  ThreadSliceLengths,
                                                                   DstDimAccessOrder,
                                                                   DstVectorAccessDim,
                                                                   1,
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp (view file @ b3d4595f)
@@ -1180,7 +1180,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     // Will do padding check on src data: Read 0 if src data is in padding area.
     // Will do padding check on dst data: No write if dst data is in paddin area.
     template <typename SrcData,
               typename DstData,
               address_space_t SrcAddressSpace = address_space_t::generic,
               address_space_t DstAddressSpace = address_space_t::generic>
     __device__ void Run_generic(const SrcData* p_src, DstData* p_dst) const
@@ -1233,7 +1233,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                     *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                         __buffer_load<SrcData, SrcDataPerAccess>(p_src, src_coord.GetOffset(), 0);
 #else
                     *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                         *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
@@ -1246,12 +1247,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 }
             }

             // SrcData to DstData conversion
             DstData p_dst_long_vector[long_vector_size];

             for(index_t i = 0; i < long_vector_size; ++i)
             {
-                p_dst_long_vector[i] = type_convert<DstData>(p_src_long_vector[i]);
+                p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
             }

             // store data from the long-vector buffer to dst
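The conversion in this threadwise copy happens on a small "long vector" staging buffer: a vector's worth of SrcData is gathered, converted element by element to DstData, and only then written out. An illustrative host-side sketch of that convert-then-store step with made-up types and sizes, where static_cast stands in for type_convert:

    #include <cstdio>

    // Sketch of the convert-then-store step: gather SrcData into a small
    // long-vector buffer, convert element-wise to DstData, then write out.
    // long_vector_size and the float -> int choice are illustrative.
    int main()
    {
        constexpr int long_vector_size = 4;
        const float p_src[long_vector_size] = {1.9f, -2.2f, 3.5f, 4.0f};

        float p_src_long_vector[long_vector_size];
        int   p_dst_long_vector[long_vector_size];

        for(int i = 0; i < long_vector_size; ++i) // load into the long-vector buffer
            p_src_long_vector[i] = p_src[i];

        for(int i = 0; i < long_vector_size; ++i) // SrcData -> DstData conversion
            p_dst_long_vector[i] = static_cast<int>(p_src_long_vector[i]);

        std::printf("%d %d %d %d\n",
                    p_dst_long_vector[0], p_dst_long_vector[1],
                    p_dst_long_vector[2], p_dst_long_vector[3]); // 1 -2 3 4
        return 0;
    }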
composable_kernel/include/utility/config_amd.hpp.in (view file @ b3d4595f)
@@ -38,11 +38,11 @@ typedef float float4_t __attribute__((ext_vector_type(4)));
 typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));

 // data type conversion
-template <class T>
+template <typename T>
 struct type_convert
 {
-    template <class X>
-    __device__ T operator()(X x) const
+    template <typename X>
+    __device__ T operator()(const X& x) const
     {
         return static_cast<T>(x);
     }