gaoqiong / composable_kernel_ROCM · Commits · 0f52c4c0

Commit 0f52c4c0, authored Sep 26, 2019 by Chao Liu (parent b3d4595f)

    added type conversion in threadwise and blockwise copy

Showing 3 changed files with 58 additions and 33 deletions:

+2  -1   composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+2  -2   composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+54 -30  composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
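Note: the core of this commit is that the threadwise copy no longer uses a single TData for both ends of the transfer. It now takes separate SrcData and DstData parameters, stages the loaded elements in a SrcData buffer, converts them element by element with type_convert<DstData>, and only then writes the DstData buffer out. Below is a minimal host-side sketch of that pattern, not the actual ThreadwiseGenericTensorSliceCopy_v4r2 code; copy_convert and this cast-based type_convert are illustrative stand-ins.

#include <cstdio>

// Illustrative stand-in for the library's type_convert functor: an
// element-wise cast from the source element type to DstData.
template <typename DstData>
struct type_convert
{
    template <typename SrcData>
    DstData operator()(SrcData v) const { return static_cast<DstData>(v); }
};

// Sketch of the copy-with-conversion pattern this commit introduces:
// load into a SrcData staging buffer, convert, then store as DstData.
template <typename SrcData, typename DstData, int N>
void copy_convert(const SrcData* p_src, DstData* p_dst)
{
    SrcData p_src_long_vector[N];
    for(int i = 0; i < N; ++i)
        p_src_long_vector[i] = p_src[i]; // load stage

    DstData p_dst_long_vector[N];
    for(int i = 0; i < N; ++i)
        p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]); // convert stage

    for(int i = 0; i < N; ++i)
        p_dst[i] = p_dst_long_vector[i]; // store stage
}

int main()
{
    float  src[4] = {1.25f, 2.5f, 3.75f, 5.0f};
    double dst[4] = {};
    copy_convert<float, double, 4>(src, dst);
    std::printf("%f %f %f %f\n", dst[0], dst[1], dst[2], dst[3]);
}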
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp

@@ -437,9 +437,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                                           Float,
                                           address_space_t::generic,
                                           address_space_t::global>
#elif 1
#else // tweaking
                .template Run_optimized_dst_address_calculation<Float,
                                                                Float,
                                                                address_space_t::generic,
                                                                address_space_t::global>
#endif
                    (p_out_thread, p_out_global);
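Both type arguments are Float in this kernel, so no actual conversion happens here; the call site only picks up the widened <SrcData, DstData, ...> interface. The `.template` keyword is needed because these are member templates invoked on an object of dependent type. A self-contained illustration of that syntax follows; Copier, store_output and the element types below are invented for the example, not part of the kernel.

#include <cstdio>

// Minimal stand-in showing a member template Run<SrcData, DstData> and the
// ".template" disambiguator required when the object has a dependent type.
template <int N>
struct Copier
{
    template <typename SrcData, typename DstData>
    void Run(const SrcData* p_src, DstData* p_dst) const
    {
        for(int i = 0; i < N; ++i)
            p_dst[i] = static_cast<DstData>(p_src[i]);
    }
};

template <typename Copy>
void store_output(const Copy& copy, const float* p_thread, double* p_global)
{
    // "copy" has a dependent type here, hence ".template" before Run<...>.
    copy.template Run<float, double>(p_thread, p_global);
}

int main()
{
    float  thread_buf[4] = {1.5f, 2.5f, 3.5f, 4.5f};
    double global_buf[4] = {};
    store_output(Copier<4>{}, thread_buf, global_buf);
    std::printf("%f\n", global_buf[0]); // 1.500000
}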
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -772,7 +772,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
                                                         BlockSrcAddressSpace,
                                                         ThreadBufferAddressSpace>(p_block_src,
                                                                                   p_thread_buffer);
-#else
+#else // tweaking
        mThreadwiseLoad.template Run_optimized_src_address_calculation<BlockSrcData,
                                                                       ThreadBufferData,
                                                                       BlockSrcAddressSpace,
@@ -793,7 +793,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
                                                          BlockDstData,
                                                          ThreadBufferAddressSpace,
                                                          BlockDstAddressSpace>(p_thread_buffer,
                                                                                p_block_dst);
-#else
+#else // tweaking
        mThreadwiseStore.template Run_optimized_dst_address_calculation<ThreadBufferData,
                                                                        BlockDstData,
                                                                        ThreadBufferAddressSpace,
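Only the #else branches change in this file (the `// tweaking` comment is added), but the surrounding context shows how the blockwise copy is split into two legs: mThreadwiseLoad moves BlockSrcData into a per-thread buffer of ThreadBufferData, and mThreadwiseStore moves that buffer out as BlockDstData, so a type conversion can happen on either leg. A rough, self-contained sketch of that two-stage structure is below; the class and method names are stand-ins, not the BlockwiseGenericTensorSliceCopy_v4 API.

#include <cstddef>

// Sketch of the two-stage blockwise copy: load block-source data into a
// per-thread buffer, then store the buffer to the block destination.
// Each leg names its own element types, so either leg may convert.
template <typename BlockSrcData, typename ThreadBufferData, typename BlockDstData,
          std::size_t N>
struct BlockwiseCopySketch
{
    ThreadBufferData thread_buffer[N] = {};

    void RunLoad(const BlockSrcData* p_block_src)
    {
        for(std::size_t i = 0; i < N; ++i)
            thread_buffer[i] = static_cast<ThreadBufferData>(p_block_src[i]);
    }

    void RunStore(BlockDstData* p_block_dst) const
    {
        for(std::size_t i = 0; i < N; ++i)
            p_block_dst[i] = static_cast<BlockDstData>(thread_buffer[i]);
    }
};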
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -1226,7 +1226,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                    const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);

                    // Check src vector's padding situation, only check the first data in this src
-                    // vector. It's user's responsiblity to make sure all data in the src vector has
+                    // vector. It's user's responsiblity to make sure all data in the src vector
+                    // has
                    // the same padding situation
                    if(src_coord.IsUpperIndexMappedToValidOffset())
                    {
@@ -1266,7 +1267,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                    const auto dst_coord = mDstSliceOrigin + (long_vector_data_begin_id + scalar_id);

                    // Check dst vector's padding situation, only check the first data in this dst
-                    // vector. It's user's responsiblity to make sure all data in the dst vector has
+                    // vector. It's user's responsiblity to make sure all data in the dst vector
+                    // has
                    // the same padding situation
                    if(dst_coord.IsUpperIndexMappedToValidOffset())
                    {
@@ -1305,13 +1307,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
    // Will do padding check on src data: Read 0 if src data is in padding area.
    // Will do padding check on dst data: No write if dst data is in paddin area.
    // This version is optimized for address calculation of src tensor
-    template <typename TData,
+    template <typename SrcData,
+              typename DstData,
              address_space_t SrcAddressSpace = address_space_t::generic,
              address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void Run_optimized_src_address_calculation(const TData* p_src, TData* p_dst) const
+    __device__ void Run_optimized_src_address_calculation(const SrcData* p_src,
+                                                          DstData* p_dst) const
    {
-        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
-        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
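With separate SrcData and DstData, the two vector access types also split: src_vector_t follows SrcData and SrcDataPerAccess, dst_vector_t follows DstData and DstDataPerAccess. A simplified stand-in for such a vector_type trait is sketched below, only to illustrate the mapping; the real trait lives elsewhere in composable_kernel and covers more types and widths.

// Simplified stand-in for a vector_type<T, N> trait: maps an element type and
// an access width to the wide type used for one vectorized memory access.
template <typename T, int N>
struct vector_type;

template <>
struct vector_type<float, 1> { using MemoryType = float; };

template <>
struct vector_type<float, 2> { struct alignas(8) MemoryType { float data[2]; }; };

template <>
struct vector_type<float, 4> { struct alignas(16) MemoryType { float data[4]; }; };

// Usage mirroring the diff: each side of the copy picks its own wide type.
using src_vector_t = vector_type<float, 4>::MemoryType; // SrcData, SrcDataPerAccess = 4
using dst_vector_t = vector_type<float, 2>::MemoryType; // DstData, DstDataPerAccess = 2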
@@ -1327,9 +1331,10 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
        constexpr auto src_linear_dim_mask    = SrcDesc::GetLinearDimensionMask();
        constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();

-        static_assert(src_linear_dim_mask.At(VectorAccessDim) || long_vector_size == SrcDataPerAccess,
-                      "Warning! VectorAccessDim is not SrcDesc's linear dimension, performance would drop");
+        static_assert(src_linear_dim_mask.At(VectorAccessDim) || long_vector_size == SrcDataPerAccess,
+                      "Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
+                      "would drop");

        // separate steps into linear and non-linear components, accoording to src tensor
        constexpr auto linear_long_vector_access_lengths =
@@ -1361,12 +1366,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];

            // buffer to hold a long-vector
-            TData p_long_vector[long_vector_size];
+            SrcData p_src_long_vector[long_vector_size];

            // zero out buffer
            for(index_t i = 0; i < long_vector_size; ++i)
            {
-                p_long_vector[i] = 0;
+                p_src_long_vector[i] = 0;
            }

            // Loop over VectorAccessDim, and load data from src to the
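The staging buffer is renamed (p_long_vector becomes p_src_long_vector) and typed as SrcData, but the zero-fill keeps its original purpose: elements whose source coordinate falls in the padding area are skipped by the IsUpperIndexMappedToValidOffset check and keep the zero ("Read 0 if src data is in padding area"). A compact sketch of that read-or-zero behavior follows, with a hypothetical in_bounds array standing in for the coordinate check.

#include <cstddef>

// Sketch: zero the staging buffer, then overwrite only the elements whose
// source location is valid; padded elements keep the zero. The in_bounds
// flags stand in for src_coord.IsUpperIndexMappedToValidOffset().
template <typename SrcData, std::size_t N>
void load_with_padding(const SrcData* p_src,
                       const bool (&in_bounds)[N],
                       SrcData (&p_src_long_vector)[N])
{
    for(std::size_t i = 0; i < N; ++i)
        p_src_long_vector[i] = 0;

    for(std::size_t i = 0; i < N; ++i)
        if(in_bounds[i])
            p_src_long_vector[i] = p_src[i];
}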
@@ -1399,22 +1404,30 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                {
                    static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
-                            __buffer_load<TData, SrcDataPerAccess>(
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
+                            __buffer_load<SrcData, SrcDataPerAccess>(
                                p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
#else
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                            *reinterpret_cast<const src_vector_t*>(
                                &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
#endif
                    }).Else([&](auto) {
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                            *reinterpret_cast<const src_vector_t*>(
                                &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
                    });
                }
            }

+            // SrcData to DstData conversion
+            DstData p_dst_long_vector[long_vector_size];
+
+            for(index_t i = 0; i < long_vector_size; ++i)
+            {
+                p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
+            }
+
            // store data from the long-vector buffer to dst
            for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
            {
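The newly added conversion loop sits between the vectorized load and the vectorized store, so each wide access stays type-homogeneous: src_vector_t reads fill the SrcData buffer, the scalar loop converts into the DstData buffer, and dst_vector_t writes drain it. The wide accesses themselves reinterpret a slot of the scalar staging buffer as the vector type; a reduced host-side sketch of one such load is below, where float4_t and load_one_vector are stand-ins and std::memcpy replaces the device-side reinterpret_cast for portability.

#include <cstring>

struct float4_t { float data[4]; }; // stand-in for vector_type<float, 4>::MemoryType

// Sketch of one vectorized load into the staging buffer: move a whole
// 4-element vector from the source into a slot of the scalar buffer.
void load_one_vector(const float* p_src, std::size_t src_offset,
                     float* p_src_long_vector, std::size_t buffer_offset)
{
    float4_t v;
    std::memcpy(&v, p_src + src_offset, sizeof(v));
    std::memcpy(p_src_long_vector + buffer_offset, &v, sizeof(v));
}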
@@ -1434,7 +1447,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                if(dst_coord.IsUpperIndexMappedToValidOffset())
                {
                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                        *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
                }
            }
        });
@@ -1447,13 +1460,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
    // Will do padding check on src data: Read 0 if src data is in padding area.
    // Will do padding check on dst data: No write if dst data is in paddin area.
    // This version is optimized for address calculation of dst tensor
-    template <typename TData,
+    template <typename SrcData,
+              typename DstData,
              address_space_t SrcAddressSpace = address_space_t::generic,
              address_space_t DstAddressSpace = address_space_t::generic>
-    __device__ void Run_optimized_dst_address_calculation(const TData* p_src, TData* p_dst) const
+    __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src,
+                                                          DstData* p_dst) const
    {
-        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
-        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
@@ -1469,9 +1484,10 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
        constexpr auto dst_linear_dim_mask    = DstDesc::GetLinearDimensionMask();
        constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();

-        static_assert(dst_linear_dim_mask.At(VectorAccessDim) || long_vector_size == DstDataPerAccess,
-                      "Warning! VectorAccessDim is not DstDesc's linear dimension, performance would drop");
+        static_assert(dst_linear_dim_mask.At(VectorAccessDim) || long_vector_size == DstDataPerAccess,
+                      "Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
+                      "would drop");

        // separate steps into linear and non-linear components, accoording to dst tensor
        constexpr auto linear_long_vector_access_lengths =
@@ -1503,12 +1519,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];

            // buffer to hold a long-vector
-            TData p_long_vector[long_vector_size];
+            SrcData p_src_long_vector[long_vector_size];

            // zero out buffer
            for(index_t i = 0; i < long_vector_size; ++i)
            {
-                p_long_vector[i] = 0;
+                p_src_long_vector[i] = 0;
            }

            // Loop over VectorAccessDim, and load data from src to the
@@ -1535,11 +1551,19 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                    // the src vector has the same padding situation
                    if(src_coord.IsUpperIndexMappedToValidOffset())
                    {
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                            *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
                    }
                }

+            // SrcData to DstData conversion
+            DstData p_dst_long_vector[long_vector_size];
+
+            for(index_t i = 0; i < long_vector_size; ++i)
+            {
+                p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
+            }
+
            // store data from the long-vector buffer to dst
            for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
            {
@@ -1564,20 +1588,20 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                {
                    static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                        __buffer_store<TData, DstDataPerAccess>(
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]),
+                        __buffer_store<DstData, DstDataPerAccess>(
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                            p_dst, dst_nonlinear_coord.GetOffset(), dst_linear_offset);
#else
                        *reinterpret_cast<dst_vector_t*>(
                            &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
#endif
                    }).Else([&](auto) {
                        *reinterpret_cast<dst_vector_t*>(
                            &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
                    });
                }
            }
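As with the load path, the store dispatches on the destination address space: a global-memory destination may take the __buffer_store intrinsic path when CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE is enabled, and everything else falls back to a plain vectorized store, now reading from the converted p_dst_long_vector. A simplified host-side sketch of that dispatch follows, with if constexpr standing in for static_if and a stub in place of the AMD intrinsic; address_space_t, float2_t and buffer_store_stub below are local stand-ins.

#include <cstring>

enum class address_space_t { generic, global };

struct float2_t { float data[2]; }; // stand-in for dst_vector_t

// Hypothetical hook standing in for the AMD __buffer_store intrinsic;
// here it just forwards to a plain store.
template <typename T>
void buffer_store_stub(const T& v, float* p_dst, std::size_t offset)
{
    std::memcpy(p_dst + offset, &v, sizeof(v));
}

// Sketch of the dispatch in the diff: global-destination stores may take the
// buffer-intrinsic path, everything else takes the plain vectorized store.
template <address_space_t DstAddressSpace>
void store_one_vector(const float2_t& v, float* p_dst, std::size_t offset)
{
    if constexpr(DstAddressSpace == address_space_t::global)
        buffer_store_stub(v, p_dst, offset);        // __buffer_store path
    else
        std::memcpy(p_dst + offset, &v, sizeof(v)); // plain vectorized store
}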