gaoqiong / composable_kernel · Commits

Commit 3b07df08, authored Jun 27, 2020 by Chao Liu

    remove threadwise copy Run_optimized_src_address_calculation

parent e371df51

Showing 2 changed files with 10 additions and 342 deletions:

- composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp (+2, -25)
- composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp (+8, -317)

composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -84,21 +84,10 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
                                         ThreadBufferData* p_thread_buffer) const
     {
-        constexpr bool has_optimized_address_calculation =
-            decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
-
         if(BlockSize == mThreadClusterDesc.GetElementSize() or
            get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
         {
-            if(has_optimized_address_calculation)
-            {
-                mThreadwiseLoad.Run_optimized_src_address_calculation(p_block_src,
-                                                                      p_thread_buffer);
-            }
-            else
-            {
-                mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
-            }
+            // TODO: threadwise copy is still being tweaked
+            mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
         }
     }
@@ -106,22 +95,10 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
                                          BlockDstData* p_block_dst) const
     {
-        constexpr bool has_optimized_address_calculation =
-            decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation();
-
         if(BlockSize == mThreadClusterDesc.GetElementSize() or
            get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize())
         {
-            if(has_optimized_address_calculation)
-            {
-                mThreadwiseStore.Run_optimized_dst_address_calculation(p_thread_buffer,
-                                                                       p_block_dst);
-            }
-            else
-            {
-                mThreadwiseStore.Run(p_thread_buffer, p_block_dst);
-            }
+            // TODO: threadwise copy is still being tweaked
+            mThreadwiseStore.Run(p_thread_buffer, p_block_dst);
        }
    }
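The guard that both functions keep is the usual partial-participation check: when the copy's thread cluster is smaller than the thread block, only the first `mThreadClusterDesc.GetElementSize()` threads take part. A minimal sketch of that pattern, where `cluster_size` is a hypothetical stand-in for `mThreadClusterDesc.GetElementSize()`, `threadIdx.x` stands in for CK's `get_thread_local_1d_id()`, and the copy body is elided:

    #include "hip/hip_runtime.h"

    // Sketch only, not CK's implementation.
    template <int BlockSize>
    __device__ void run_if_participating(int cluster_size)
    {
        // Either every thread of the block has work (BlockSize == cluster_size),
        // or only the first cluster_size threads perform the copy.
        if(BlockSize == cluster_size || static_cast<int>(threadIdx.x) < cluster_size)
        {
            // ... per-thread copy work (mThreadwiseLoad.Run / mThreadwiseStore.Run) ...
        }
    }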
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -93,11 +93,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             // buffer to hold a src long-vector
             SrcData p_src_long_vector[long_vector_size];

+#if 1
             // zero out buffer
             for(index_t i = 0; i < long_vector_size; ++i)
             {
                 p_src_long_vector[i] = 0;
             }
+#endif

             // load data from src to the long-vector buffer
             for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
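The zero-out loop (now wrapped in `#if 1` so it can be toggled while tuning) is what backs the "Read 0 if src data has an invalid mapping" contract stated further down: elements whose source mapping is invalid are simply never overwritten. A minimal sketch of that read-side contract, assuming a scalar copy rather than CK's vectorized `transfer_data`:

    // Sketch only: a scalar stand-in for the masked read done by transfer_data;
    // the real function is vectorized and parameterized by address space and
    // in-memory operation.
    template <typename T, int N>
    __device__ void masked_read(const T* src, int src_offset, bool src_valid,
                                T (&buf)[N], int buf_offset)
    {
        if(src_valid)
        {
            buf[buf_offset] = src[src_offset];
        }
        // else: keep the 0 written by the caller's zero-out loop
    }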
@@ -121,9 +123,11 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                                1>(p_src,
                                   src_coord.GetOffset(),
                                   src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
+                                  SrcDesc::GetElementSpace(),
                                   p_src_long_vector,
                                   buffer_offset,
-                                  true);
+                                  true,
+                                  long_vector_size);
             }

             // SrcData to DstData conversion
@@ -156,328 +160,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                                DstDataStride>(p_dst_long_vector,
                                               buffer_offset,
                                               true,
+                                              long_vector_size,
                                               p_dst,
                                               dst_coord.GetOffset(),
-                                              dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
+                                              dst_coord.IsOffsetValidAssumingUpperIndexIsValid(),
+                                              DstDesc::GetElementSpace());
             }
         });
     }
-    // Modify Length to 1, if Mask is set to false
-    // Used for isolating linear dimensions from non-linear dimensions
-    template <index_t... Lengths, index_t... Mask>
-    __device__ static constexpr auto mask_lengths(Sequence<Lengths...>, Sequence<Mask...>)
-    {
-        return Sequence<(Mask ? Lengths : 1)...>{};
-    }
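For reference, the removed helper keeps a length where the mask bit is set and collapses it to 1 otherwise, so iterating over the masked lengths only walks the selected dimensions. A hedged example of its effect, with illustrative lengths and assuming a `std::is_same`-style check over CK's `Sequence`:

    // Illustrative values: Lengths = <4, 8, 16>, Mask = <1, 0, 1>
    // mask_lengths(Sequence<4, 8, 16>{}, Sequence<1, 0, 1>{}) -> Sequence<4, 1, 16>
    static_assert(std::is_same<decltype(mask_lengths(Sequence<4, 8, 16>{},
                                                     Sequence<1, 0, 1>{})),
                               Sequence<4, 1, 16>>::value,
                  "masked-out dimensions collapse to length 1");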
-    // Will do valid mapping check on src data: Read 0 if src data has an invalid mapping
-    // Will do valid mapping check on dst data: No write if dst data has an invalid mapping
-    // This version is optimized for address calculation of src tensor
-    // TODO: this function is not compiled to expected ISA
-    template <typename SrcData, typename DstData>
-    __device__ void Run_optimized_src_address_calculation(const SrcData* p_src,
-                                                          DstData* p_dst) const
-    {
-        constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{};
-
-        constexpr auto src_data_per_access = Number<SrcDataPerRead>{};
-        constexpr auto dst_data_per_access = Number<DstDataPerWrite>{};
-
-        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerRead, DstDataPerWrite)>{};
-
-        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
-            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
-
-        // separate linear dimensions from non-linear dimensions
-        constexpr auto src_linear_dim_mask    = SrcDesc::GetLinearDimensionMask();
-        constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();
-
-        static_assert(src_linear_dim_mask.At(SrcDstVectorReadWriteDim) ||
-                          long_vector_size == SrcDataPerRead,
-                      "Warning! SrcDstVectorReadWriteDim is not SrcDesc's linear dimension, "
-                      "performance would drop");
-
-        // separate steps into linear and non-linear components, according to src tensor
-        constexpr auto linear_long_vector_access_lengths =
-            mask_lengths(long_vector_access_lengths, src_linear_dim_mask);
-
-        constexpr auto nonlinear_long_vector_access_lengths =
-            mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask);
-
-        // loop over src's non-linear dimensions
-        ford<decltype(nonlinear_long_vector_access_lengths)>{}(
-            [&](auto nonlinear_dim_long_vector_access_id) {
-                // calculate step-sizes along src's nonlinear dimensions
-                auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
-                nonlinear_dim_data_steps(vector_access_dim) =
-                    long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
-
-                // move src coordinate along nonlinear dimensions
-                // this coordinate contains run-time per-thread offset
-                const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps;
-
-                // loop over src's linear dimensions
-                ford<decltype(linear_long_vector_access_lengths)>{}(
-                    [&](auto linear_dim_long_vector_access_id) {
-                        // step-sizes along src's linear dimensions
-                        auto linear_dim_data_steps = linear_dim_long_vector_access_id;
-                        linear_dim_data_steps(vector_access_dim) =
-                            long_vector_size *
-                            linear_dim_long_vector_access_id[vector_access_dim];
-
-                        // buffer to hold a long-vector
-                        SrcData p_src_long_vector[long_vector_size];
-
-                        // zero out buffer
-                        for(index_t i = 0; i < long_vector_size; ++i)
-                        {
-                            p_src_long_vector[i] = 0;
-                        }
-
-                        // Loop over SrcDstVectorReadWriteDim, and load data from src to the
-                        //   long-vector buffer.
-                        // If SrcDstVectorReadWriteDim is src's linear dimension, then src's
-                        //   offset-diff due to this looping is known at compile-time. If
-                        //   SrcDstVectorReadWriteDim is src's nonlinear dimension, then src's
-                        //   offset-diff due to this looping is only known at run-time. For best
-                        //   performance, SrcDstVectorReadWriteDim should be src's linear
-                        //   dimension
-                        for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
-                        {
-                            auto scalar_id               = make_zero_array<index_t, nDim>();
-                            scalar_id(vector_access_dim) = i * src_data_per_access;
-
-                            const index_t buffer_offset = i * src_data_per_access;
-
-                            // move src coordinate along linear dimensions
-                            const auto src_coord =
-                                src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
-
-#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
-                            // this is src compile-time offset
-                            const index_t src_linear_offset =
-                                src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps +
-                                                                        scalar_id);
-#else
-                            // this is src compile-time offset
-                            const index_t src_linear_offset =
-                                src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
-#endif
-
-                            // Check src data's valid mapping situation, only check the first
-                            // data in this src vector. It's user's responsibility to make sure
-                            // all data in the src vector has the valid/invalid mapping situation
-                            transfer_data<SrcData,
-                                          SrcDataPerRead,
-                                          SrcAddressSpace,
-                                          AddressSpace::Vgpr,
-                                          InMemoryDataOperation::Set>(
-                                p_src,
-                                src_nonlinear_coord.GetOffset() + src_linear_offset,
-                                src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
-                                p_src_long_vector,
-                                buffer_offset,
-                                true);
-                        }
-
-                        // SrcData to DstData conversion
-                        DstData p_dst_long_vector[long_vector_size];
-
-                        for(index_t i = 0; i < long_vector_size; ++i)
-                        {
-                            p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
-                        }
-
-                        // store data from the long-vector buffer to dst
-                        for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
-                        {
-                            auto scalar_id               = make_zero_array<index_t, nDim>();
-                            scalar_id(vector_access_dim) = i * dst_data_per_access;
-
-                            const index_t buffer_offset = i * dst_data_per_access;
-
-                            // dst offset is calculated here, without explicitly separating into
-                            // compile-time and per-thread component
-                            const auto dst_coord =
-                                mDstSliceOrigin +
-                                (nonlinear_dim_data_steps + linear_dim_data_steps + scalar_id);
-
-                            // Check dst data's valid mapping situation, only check the first
-                            // data in this dst vector. It's user's responsibility to make sure
-                            // all data in the dst vector has the valid/invalid mapping situation
-                            transfer_data<DstData,
-                                          DstDataPerWrite,
-                                          AddressSpace::Vgpr,
-                                          DstAddressSpace,
-                                          DstInMemOp>(
-                                p_dst_long_vector,
-                                buffer_offset,
-                                true,
-                                p_dst,
-                                dst_coord.GetOffset(),
-                                dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
-                        }
-                    });
-            });
-    }
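The point of the removed split is that, along a linear dimension, the tensor offset is affine in the index, so each vector access decomposes into a run-time base offset (computed once per nonlinear iteration) plus a linear step that is known at compile time, per the comments above. A worked sketch of that arithmetic, with a made-up stride and CK's `index_t` assumed:

    // Sketch only: stride and step values are made up. For a linear dimension
    // with stride S, offset(base + step) == offset(base) + S * step, so the
    // S * step term is a compile-time constant.
    __device__ index_t offset_decomposition_example(index_t base_offset)
    {
        // base_offset plays the role of src_nonlinear_coord.GetOffset() (run-time)
        constexpr index_t S    = 16; // hypothetical stride of the linear dimension
        constexpr index_t step = 2;  // hypothetical step along that dimension

        constexpr index_t linear_offset = S * step; // like src_linear_offset

        // what the removed code handed to transfer_data as the src offset
        return base_offset + linear_offset;
    }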
-    // This version is optimized for address calculation of dst tensor
-    // TODO: this function is not compiled to expected ISA
-    template <typename SrcData, typename DstData>
-    __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src,
-                                                          DstData* p_dst) const
-    {
-        constexpr auto vector_access_dim = Number<SrcDstVectorReadWriteDim>{};
-
-        constexpr auto src_data_per_access = Number<SrcDataPerRead>{};
-        constexpr auto dst_data_per_access = Number<DstDataPerWrite>{};
-
-        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerRead, DstDataPerWrite)>{};
-
-        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
-            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
-
-        // separate linear dimensions from non-linear dimensions
-        constexpr auto dst_linear_dim_mask    = DstDesc::GetLinearDimensionMask();
-        constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();
-
-        static_assert(dst_linear_dim_mask.At(SrcDstVectorReadWriteDim) ||
-                          long_vector_size == DstDataPerWrite,
-                      "Warning! SrcDstVectorReadWriteDim is not DstDesc's linear dimension, "
-                      "performance would drop");
-
-        // separate steps into linear and non-linear components, according to dst tensor
-        constexpr auto linear_long_vector_access_lengths =
-            mask_lengths(long_vector_access_lengths, dst_linear_dim_mask);
-
-        constexpr auto nonlinear_long_vector_access_lengths =
-            mask_lengths(long_vector_access_lengths, dst_nonlinear_dim_mask);
-
-        // loop over dst's non-linear dimensions
-        ford<decltype(nonlinear_long_vector_access_lengths)>{}(
-            [&](auto nonlinear_dim_long_vector_access_id) {
-                // calculate step-sizes along dst's nonlinear dimensions
-                auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id;
-                nonlinear_dim_data_steps(vector_access_dim) =
-                    long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim];
-
-                // move dst coordinate along nonlinear dimensions
-                // this coordinate contains run-time per-thread offset
-                const auto dst_nonlinear_coord = mDstSliceOrigin + nonlinear_dim_data_steps;
-
-                // loop over dst's linear dimensions
-                ford<decltype(linear_long_vector_access_lengths)>{}(
-                    [&](auto linear_dim_long_vector_access_id) {
-                        // step-sizes along dst's linear dimensions
-                        auto linear_dim_data_steps = linear_dim_long_vector_access_id;
-                        linear_dim_data_steps(vector_access_dim) =
-                            long_vector_size *
-                            linear_dim_long_vector_access_id[vector_access_dim];
-
-                        // buffer to hold a long-vector
-                        SrcData p_src_long_vector[long_vector_size];
-
-                        // zero out buffer
-                        for(index_t i = 0; i < long_vector_size; ++i)
-                        {
-                            p_src_long_vector[i] = 0;
-                        }
-
-                        // Loop over SrcDstVectorReadWriteDim, and load data from src to the
-                        //   long-vector buffer.
-                        // If SrcDstVectorReadWriteDim is dst's linear dimension, then dst's
-                        //   offset-diff due to this looping is known at compile-time. If
-                        //   SrcDstVectorReadWriteDim is dst's nonlinear dimension, then dst's
-                        //   offset-diff due to this looping is only known at run-time. For best
-                        //   performance, SrcDstVectorReadWriteDim should be dst's linear
-                        //   dimension
-                        for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
-                        {
-                            auto scalar_id               = make_zero_array<index_t, nDim>();
-                            scalar_id(vector_access_dim) = i * src_data_per_access;
-
-                            const index_t buffer_offset = i * src_data_per_access;
-
-                            // src offset is calculated here, without explicitly separating into
-                            // compile-time and per-thread component
-                            const auto src_coord =
-                                mSrcSliceOrigin +
-                                (nonlinear_dim_data_steps + linear_dim_data_steps + scalar_id);
-
-                            // Check src data's valid mapping situation, only check the first
-                            // data in this src vector. It's user's responsibility to make sure
-                            // all data in the src vector has the valid/invalid mapping situation
-                            transfer_data<SrcData,
-                                          SrcDataPerRead,
-                                          SrcAddressSpace,
-                                          AddressSpace::Vgpr,
-                                          InMemoryDataOperation::Set>(
-                                p_src,
-                                src_coord.GetOffset(),
-                                src_coord.IsOffsetValidAssumingUpperIndexIsValid(),
-                                p_src_long_vector,
-                                buffer_offset,
-                                true);
-                        }
-
-                        // SrcData to DstData conversion
-                        DstData p_dst_long_vector[long_vector_size];
-
-                        for(index_t i = 0; i < long_vector_size; ++i)
-                        {
-                            p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
-                        }
-
-                        // store data from the long-vector buffer to dst
-                        for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
-                        {
-                            auto scalar_id               = make_zero_array<index_t, nDim>();
-                            scalar_id(vector_access_dim) = i * dst_data_per_access;
-
-                            const index_t buffer_offset = i * dst_data_per_access;
-
-                            // move dst coordinate along linear dimensions
-                            const auto dst_coord =
-                                dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
-
-#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking
-                            // this is dst compile-time offset
-                            const index_t dst_linear_offset =
-                                dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps +
-                                                                        scalar_id);
-#else
-                            // this is dst compile-time offset
-                            const index_t dst_linear_offset =
-                                dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
-#endif
-
-                            // Check dst data's valid mapping situation, only check the first
-                            // data in this dst vector. It's user's responsibility to make sure
-                            // all data in the dst vector has the valid/invalid mapping situation
-                            transfer_data<DstData,
-                                          DstDataPerWrite,
-                                          AddressSpace::Vgpr,
-                                          DstAddressSpace,
-                                          DstInMemOp>(
-                                p_dst_long_vector,
-                                buffer_offset,
-                                true,
-                                p_dst,
-                                dst_nonlinear_coord.GetOffset() + dst_linear_offset,
-                                dst_coord.IsOffsetValidAssumingUpperIndexIsValid());
-                        }
-                    });
-            });
-    }
-    __device__ static constexpr bool HasWorkingOptimizedAddressCalculation()
-    {
-#if CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION // tweaking
-        return true;
-#else
-        return false;
-#endif
-    }
     template <typename T, bool PositiveDirection>
     __device__ void MoveSrcSliceWindow(const T& step_sizes_,
                                        integral_constant<bool, PositiveDirection>)