gaoqiong / composable_kernel · Commits

Commit 77c81617, authored Jan 16, 2021 by Chao Liu

improving index calculation

parent f2f35201

Changes 11
Showing 11 changed files with 1061 additions and 459 deletions (+1061 −459)
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp   +15 −11
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp   +550 −14
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp   +18 −141
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp   +38 −32
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp   +350 −251
composable_kernel/include/utility/amd_buffer_addressing.hpp   +5 −0
composable_kernel/include/utility/config.amd.hpp.in   +1 −1
composable_kernel/include/utility/sequence.hpp   +7 −0
driver/include/device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp   +35 −0
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp   +40 −7
driver/src/conv_driver.cpp   +2 −2
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp (View file @ 77c81617)

@@ -34,7 +34,7 @@ template <index_t BlockSize,
           index_t GemmBBlockTransferSrcScalarPerVector_GemmN,
           index_t GemmBBlockTransferDstScalarPerVector_GemmN,
           index_t GemmCThreadTransferDstScalarPerVector_GemmN1>
-struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
+struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
 {
     template <typename... Wei, typename... In, typename... Out>
     __host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
@@ -96,18 +96,11 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
         // input tensor
-        const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
-            transform_dynamic_tensor_descriptor(
-                in_n_c_hi_wi_global_desc,
-                make_tuple(DynamicPassThrough{N},
-                           DynamicPassThrough{C},
-                           DynamicLeftPad{Hi, InLeftPadH},
-                           DynamicLeftPad{Wi, InLeftPadW}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
-            make_tuple(DynamicPassThrough{N},
-                       DynamicPassThrough{C},
-                       DynamicRightPad{Hi + InLeftPadH, InRightPadH},
-                       DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+        const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
+            in_n_c_hi_wi_global_desc,
+            make_tuple(DynamicPassThrough{N},
+                       DynamicPassThrough{C},
+                       DynamicPad{Hi, InLeftPadH, InRightPadH},
+                       DynamicPad{Wi, InLeftPadW, InRightPadW}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
@@ -164,6 +157,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
         const index_t GemmM0 = GemmM / GemmM1;
         const index_t GemmN0 = GemmN / GemmN1;

+#if 1 // debug
         const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
             transform_dynamic_tensor_descriptor(
                 out_gemmm_gemmn_global_desc,
@@ -171,6 +165,16 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
                            DynamicUnMerge<2>{make_multi_index(GemmN0, GemmN1)}),
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
+#else
+        const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc =
+            transform_dynamic_tensor_descriptor(
+                out_gemmm_gemmn_global_desc,
+                make_tuple(
+                    HackSemiDynamicUnMerge<3, Sequence<GemmM1>>{make_multi_index(1, GemmM0)},
+                    HackSemiDynamicUnMerge<3, Sequence<GemmN1>>{make_multi_index(1, GemmN0)}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
+#endif

         // GEMM
         using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v1<
...
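The change above swaps the chained DynamicLeftPad / DynamicRightPad transforms for the single DynamicPad transform added in dynamic_multi_index_transform.hpp further down. As a quick sanity check of the index arithmetic a combined pad performs, here is a small host-side sketch; PadSketch and its members are illustrative stand-ins for this note only, not the library's API.

    #include <cstdio>

    // Minimal model of a combined left/right pad transform:
    // upper length = low_length + left_pad + right_pad,
    // lower index  = upper index - left_pad,
    // an upper index is valid iff it does not land in the padding.
    struct PadSketch
    {
        int low_length, left_pad, right_pad;

        int UpperLength() const { return low_length + left_pad + right_pad; }
        int LowerIndex(int idx_up) const { return idx_up - left_pad; }
        bool IsValid(int idx_up) const
        {
            return idx_up >= left_pad && idx_up < UpperLength() - right_pad;
        }
    };

    int main()
    {
        // e.g. Hi = 5, InLeftPadH = 2, InRightPadH = 1
        PadSketch pad{5, 2, 1};

        for(int i = 0; i < pad.UpperLength(); ++i)
            std::printf("up %d -> low %d (%s)\n", i, pad.LowerIndex(i),
                        pad.IsValid(i) ? "valid" : "pad");
    }

With low_length = 5, left_pad = 2, right_pad = 1 the upper length is 8; upper indices 0, 1 and 7 land in the padding, and indices 2 through 6 map to lower indices 0 through 4.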
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp (View file @ 77c81617)

@@ -93,6 +93,103 @@ struct DynamicPassThrough
     }
 };

+template <bool SkipIsValidCheck = false>
+struct DynamicPad
+{
+    using LowerIndex = MultiIndex<1>;
+    using UpperIndex = MultiIndex<1>;
+
+    const UpperIndex up_lengths_;
+    const index_t left_pad_;
+    const index_t right_pad_;
+
+#if 0
+    __host__ __device__ constexpr DynamicPad(const DynamicPad&) = default;
+    __host__ __device__ constexpr DynamicPad(DynamicPad&&)      = default;
+#else
+    __host__ __device__ constexpr DynamicPad(const DynamicPad& other)
+        : up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
+    {
+    }
+
+    __host__ __device__ constexpr DynamicPad(DynamicPad&& other)
+        : up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
+    {
+    }
+#endif
+
+    __host__ __device__ constexpr DynamicPad(const index_t& low_length,
+                                             const index_t& left_pad,
+                                             const index_t& right_pad)
+        : up_lengths_{make_multi_index(low_length + left_pad + right_pad)},
+          left_pad_{left_pad},
+          right_pad_{right_pad}
+    {
+    }
+
+    __host__ __device__ constexpr DynamicPad() : up_lengths_{0}, left_pad_{0}, right_pad_{0} {}
+
+    __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
+
+    __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; }
+
+    __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; }
+
+    template <typename LowIdx, typename UpIdx>
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
+    {
+        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_;
+    }
+
+    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
+    __host__ __device__ static constexpr void
+    CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
+                            const UpIdxDiff& idx_diff_up,
+                            const LowIdx& /* idx_low_old */,
+                            const UpIdx& /* idx_up_old */)
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
+    }
+
+    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
+    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
+                                                                 const UpIdxDiff& idx_diff_up,
+                                                                 const LowIdx& idx_low_old,
+                                                                 const UpIdx& idx_up_old,
+                                                                 Number<Hack>)
+    {
+        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
+    }
+
+    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
+
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    {
+        return SkipIsValidCheck;
+    }
+
+    template <typename UpIdx>
+    __host__ __device__ constexpr bool
+    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const
+    {
+        return SkipIsValidCheck ||
+               ((idx_up[Number<0>{}] >= left_pad_) &&
+                (idx_up[Number<0>{}] < up_lengths_[Number<0>{}] - right_pad_));
+    }
+};
+
 template <bool SkipIsValidCheck = false>
 struct DynamicLeftPad
 {
@@ -546,15 +643,17 @@ struct DynamicMerge
     }

     // idx_diff_low depends on idx_low_old, so idx_low need to be up-to-date
     //
     // If idx_diff_up is known at compile-time, many calculations can be optimized
     // away by compiler
     // This function assume idx_low_old is not out-of-bound
+    // this version save computation but use more register
     template <typename LowIdxDiff,
               typename UpIdxDiff,
               typename LowIdx,
               typename UpIdx,
               index_t Hack>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
+    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack_1(LowIdxDiff& idx_diff_low,
                                                                     const UpIdxDiff& idx_diff_up,
                                                                     const LowIdx& idx_low_old,
                                                                     const UpIdx& /* idx_up_old */,
@@ -579,7 +678,10 @@ struct DynamicMerge
         // computed at
         // run-time each time this function is called, and can be very expensive.
         LowerIndex idx_diff_low_const;
+        LowerIndex idx_low_length_minus_idx_diff_low_const;
+        LowerIndex idx_low_length_plus_idx_diff_low_const;

+#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
         index_t tmp = idx_diff_up[Number<0>{}];

         static_for<0, NDimLow - 1, 1>{}([&](auto i) {
@@ -587,10 +689,6 @@ struct DynamicMerge
             tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
         });

-        LowerIndex idx_low_length_minus_idx_diff_low_const;
-        LowerIndex idx_low_length_plus_idx_diff_low_const;
-
-#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
         idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;

         static_for<0, NDimLow, 1>{}([&](auto i) {
@@ -600,6 +698,13 @@ struct DynamicMerge
         });
#else
         // Hack: this force result into SGPR. Need to make sure the result is thread invariant
+        index_t tmp = idx_diff_up[Number<0>{}];
+
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+
         idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);

         static_for<0, NDimLow, 1>{}([&](auto i) {
@@ -633,6 +738,27 @@ struct DynamicMerge
             idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
         }
         else if constexpr(Hack == 2)
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            index_t borrow = 0;
+
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                index_t idx_low_tmp = idx_low_old[i] - borrow;
+
+                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
+
+                idx_diff_low(i) =
+                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
+
+                idx_diff_low(i) -= borrow;
+
+                borrow = do_borrow ? 1 : 0;
+            });
+
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;
+        }
+        else
         {
             // do carry check on each low dimension in reversed order
             // do not need to check the first dimension
@@ -641,26 +767,127 @@ struct DynamicMerge
             static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
                 index_t idx_low_tmp = idx_low_old[i] + carry;

                 bool do_carry  = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
                 bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];

                 idx_diff_low(i) =
                     do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
                 idx_diff_low(i) =
                     do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];

                 idx_diff_low(i) += carry;

                 carry = do_carry ? 1 : 0;
                 carry = do_borrow ? -1 : carry;
             });

             idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
         }
     }
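The Hack == 1 and Hack == 2 branches above replace a full re-decomposition of the merged index with a single carry (or borrow) pass that ripples from the fastest dimension toward dimension 0. A standalone sketch of the carry pass, using std::vector and a plain loop in place of the library's MultiIndex and static_for (illustrative only):

    #include <cstdio>
    #include <vector>

    // Add a per-dimension step "diff_const" to a multi-index "idx" living inside a
    // merged box of extents "lengths", propagating the carry from the fastest
    // dimension (back of the array) toward dimension 0, as in the Hack == 1 path above.
    void add_with_carry(std::vector<int>& idx, const std::vector<int>& diff_const,
                        const std::vector<int>& lengths)
    {
        int carry = 0;
        for(int i = static_cast<int>(idx.size()) - 1; i > 0; --i)
        {
            int tmp       = idx[i] + diff_const[i] + carry;
            bool do_carry = tmp >= lengths[i];
            idx[i]        = do_carry ? tmp - lengths[i] : tmp;
            carry         = do_carry ? 1 : 0;
        }
        idx[0] += diff_const[0] + carry; // dimension 0 absorbs the final carry
    }

    int main()
    {
        std::vector<int> idx{1, 2, 6}, lengths{4, 3, 8};
        add_with_carry(idx, /*diff_const=*/{0, 0, 3}, lengths); // step of 3 along the fastest dim
        std::printf("%d %d %d\n", idx[0], idx[1], idx[2]);      // prints 2 0 1
    }

Starting from {1, 2, 6} in a 4 x 3 x 8 box and stepping by 3 along the fastest dimension gives {2, 0, 1}, the same answer as re-decomposing the flat index 46 + 3 = 49.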
+    // idx_diff_low depends on idx_low_old, so idx_low need to be up-to-date
+    // If idx_diff_up is known at compile-time, many calculations can be optimized
+    // away by compiler
+    // This function assume idx_low_old is not out-of-bound
+    // this version use less register but more computation
+    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
+    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack_2(LowIdxDiff& idx_diff_low,
+                                                                      const UpIdxDiff& idx_diff_up,
+                                                                      const LowIdx& idx_low_old,
+                                                                      const UpIdx& /* idx_up_old */,
+                                                                      Number<Hack>) const
+    {
+        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
+                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
+        // However,
+        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
+        //      can be calculated at compile-time.
+        //   2) If idx_diff_up is not known at compile-time, but its value doesn't change
+        //      during the whole kernel execution, then idx_diff_low_const also doesn't
+        //      change during the whole kernel execution. Compiler generated ISA should
+        //      only calculate idx_diff_low_const once and save it during the whole kernel execution
+        // If neither 1) nor 2) is satisfied, then the calculation will also be computed at
+        // run-time each time this function is called, and can be very expensive.
+        LowerIndex idx_diff_low_const;
+        LowerIndex idx_low_length_minus_idx_diff_low_const;
+        LowerIndex idx_low_length_plus_idx_diff_low_const;
+
+#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
+        index_t tmp = idx_diff_up[Number<0>{}];
+
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+
+        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
+#else
+        // Hack: this force result into SGPR. Need to make sure the result is thread invariant
+        index_t tmp = idx_diff_up[Number<0>{}];
+
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+
+        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
+#endif
+
+        if constexpr(Hack == 1)
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            bool do_carry = 0;
+
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
+
+                index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
+
+                do_carry = idx_low_tmp >= low_lengths_[i];
+
+                idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i];
+            });
+
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry;
+        }
+        else if constexpr(Hack == 2)
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            bool do_borrow = 0;
+
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
+
+                index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow;
+
+                do_borrow = idx_low_tmp < 0;
+
+                idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i];
+            });
+
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
+        }
+        else
+        {
+#if 0
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            index_t carry = 0;
+
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                index_t idx_low_tmp = idx_low_old[i] + carry;
+                idx_diff_low(i) = idx_diff_low_const[i] + carry;
+
+                bool do_carry  = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
+                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
+
+                idx_diff_low(i) =
+                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
+
+                idx_diff_low(i) += carry;
+
+                carry = do_carry ? 1 : 0;
+                carry = do_borrow ? -1 : carry;
+            });
+
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
+#endif
+        }
+    }
+
+    // idx_diff_low depends on idx_low_old, so idx_low need to be up-to-date
+    // If idx_diff_up is known at compile-time, many calculations can be optimized
+    // away by compiler
+    // This function assume idx_low_old is not out-of-bound
+    // this version use less register but more computation
+    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
+    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack_3(LowIdxDiff& idx_diff_low,
+                                                                      const UpIdxDiff& idx_diff_up,
+                                                                      const LowIdx& idx_low_old,
+                                                                      const UpIdx& /* idx_up_old */,
+                                                                      Number<Hack>) const
+    {
+        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
+                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
+        // However,
+        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
+        //      can be calculated at compile-time.
+        //   2) If idx_diff_up is not known at compile-time, but its value doesn't change
+        //      during the whole kernel execution, then idx_diff_low_const also doesn't
+        //      change during the whole kernel execution. Compiler generated ISA should
+        //      only calculate idx_diff_low_const once and save it during the whole kernel execution
+        // If neither 1) nor 2) is satisfied, then the calculation will also be computed at
+        // run-time each time this function is called, and can be very expensive.
+        LowerIndex idx_diff_low_const;
+        LowerIndex idx_low_length_minus_idx_diff_low_const;
+        LowerIndex idx_low_length_plus_idx_diff_low_const;
+
+#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
+        index_t tmp = idx_diff_up[Number<0>{}];
+
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+
+        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
+#else
+        // Hack: this force result into SGPR. Need to make sure the result is thread invariant
+        index_t tmp = idx_diff_up[Number<0>{}];
+
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+
+        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
+#endif
+
+        if constexpr(Hack == 1)
+        {
+#if 1
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            bool do_carry = 0;
+
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
+
+                index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
+
+                do_carry = idx_low_tmp >= low_lengths_[i];
+
+                idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i];
+            });
+
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry;
+#else
+            LowerIndex idx_low_new = idx_low_old;
+
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                auto i_m1 = i - Number<1>{};
+
+                int64_t exec_mask;
+                int64_t do_carry;
+
+                idx_low_new(i) = idx_diff_low_const[i] + idx_low_old[i];
+
+                asm volatile("\n \
+                    s_mov_b64 %0, exec \n \
+                    v_cmpx_le_u32_e64 %1, %4, %2 \n \
+                    v_subrev_u32 %2, %4, %2 \n \
+                    v_add_u32 %3, %3, 1 \n \
+                    s_mov_b64 exec, %0 \n \
+                    "
+                             : "=s"(exec_mask), "=s"(do_carry), "=v"(idx_low_new(i)), "=v"(idx_low_new(i_m1))
+                             : "s"(low_lengths_[i]), "2"(idx_low_new[i]), "3"(idx_low_new[i_m1]));
+
+                idx_diff_low(i) = idx_low_new[i] - idx_low_old[i];
+            });
+
+            constexpr auto I0 = Number<0>{};
+
+            idx_low_new(I0) += idx_diff_low_const[I0];
+
+            idx_diff_low(I0) = idx_low_new[I0] - idx_low_old[I0];
+#endif
+        }
+        else if constexpr(Hack == 2)
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            bool do_borrow = 0;
+
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
+
+                index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow;
+
+                do_borrow = idx_low_tmp < 0;
+
+                idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i];
+            });
+
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
+        }
+        else
+        {
+#if 0
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            index_t carry = 0;
+
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                idx_diff_low(i) = idx_diff_low_const[i] + carry;
+
+                bool do_carry  = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
+                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
+
+                idx_diff_low(i) =
+                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
+
+                idx_diff_low(i) += carry;
+
+                carry = do_carry ? 1 : 0;
+                carry = do_borrow ? -1 : carry;
+            });
+
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
+#endif
+        }
+    }
+    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
+    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
+                                                                    const UpIdxDiff& idx_diff_up,
+                                                                    const LowIdx& idx_low_old,
+                                                                    const UpIdx& idx_up_old,
+                                                                    Number<Hack>) const
+    {
+#if 0
+        // this version save computation but use more register
+        CalculateLowerIndexDiff_hack_1(
+            idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
+#elif 1
+        // this version use less register but more computation
+        CalculateLowerIndexDiff_hack_2(
+            idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
+#elif 1
+        // this version use less register but more computation
+        CalculateLowerIndexDiff_hack_3(
+            idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
+#endif
+    }

     __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

     __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
@@ -695,7 +1104,7 @@ struct DynamicMerge
     }
 };

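What all three hack variants try to hoist or avoid is the division chain that turns a step in the merged (upper) dimension into per-dimension (lower) steps. A standalone sketch of that decomposition against precomputed scan strides, with std::vector standing in for the library's containers (illustrative only):

    #include <cstdio>
    #include <vector>

    // Decompose a flat index over a merged set of dimensions into per-dimension
    // components using precomputed "scan" strides (product of the lengths to the
    // right of each dimension), mirroring the tmp / low_lengths_scan_[i] loop above.
    std::vector<int> decompose(int flat, const std::vector<int>& scan_strides)
    {
        std::vector<int> idx(scan_strides.size() + 1);
        int tmp = flat;
        for(std::size_t i = 0; i < scan_strides.size(); ++i)
        {
            idx[i] = tmp / scan_strides[i];
            tmp -= idx[i] * scan_strides[i];
        }
        idx.back() = tmp;
        return idx;
    }

    int main()
    {
        // lengths {4, 3, 8} -> scan strides {3 * 8, 8} = {24, 8}
        const auto idx = decompose(49, {24, 8});
        std::printf("%d %d %d\n", idx[0], idx[1], idx[2]); // prints 2 0 1
    }

Whether this chain is folded at compile time, computed once per kernel, or re-run on every call is exactly the trade-off spelled out in the comments above.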
-template <index_t NDimUp>
+template <index_t NDimUp, bool Use24BitIntegerCalculation = false>
 struct DynamicUnMerge
 {
     using LowerIndex = MultiIndex<1>;
@@ -726,12 +1135,26 @@ struct DynamicUnMerge
     template <typename LowIdx, typename UpIdx>
     __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
                                                            const UpIdx& idx_up) const
     {
+        if constexpr(!Use24BitIntegerCalculation)
         {
             idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];

             static_for<0, NDimUp - 1, 1>{}(
                 [&](auto i) { idx_low(Number<0>{}) += idx_up[i] * up_lengths_scan_[i]; });
         }
+        else
+        {
+            idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
+
+            static_for<0, NDimUp - 1, 1>{}([&](auto i) {
+                idx_low(Number<0>{}) =
+                    (0x00ffffff & idx_low[Number<0>{}]) +
+                    (0x00ffffff & idx_up[i]) * (0x00ffffff & up_lengths_scan_[i]);
+            });
+        }
     }
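The new Use24BitIntegerCalculation path masks each operand with 0x00ffffff before the multiply-accumulate; the apparent intent, assuming every operand really fits in 24 bits, is to let the AMD backend pick its cheaper 24-bit integer multiply. The host-side check below only demonstrates that the masked arithmetic is equivalent under that assumption; it says nothing about instruction selection.

    #include <cassert>
    #include <cstdint>

    // The 24-bit path above computes (0x00ffffff & acc) + (0x00ffffff & idx) * (0x00ffffff & stride).
    // As long as every operand really fits in 24 bits, the masked expression equals the plain one.
    int main()
    {
        const uint32_t mask = 0x00ffffff;
        uint32_t acc = 1000, idx = 123, stride = 4096; // all < 2^24

        uint32_t plain  = acc + idx * stride;
        uint32_t masked = (mask & acc) + (mask & idx) * (mask & stride);

        assert(plain == masked);
        return 0;
    }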
     template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
     __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
@@ -836,5 +1259,118 @@ struct DynamicFreeze
     }
 };

#if 0
template <index_t NDimUp, typename StaticPartialUpLengths>
struct HackSemiDynamicUnMerge
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;
const UpperIndex up_lengths_;
const UpperIndex up_lengths_scan_;
static constexpr index_t NDimUpStatic = StaticPartialUpLengths::Size();
static constexpr index_t NDimUpDynamic = NDimUp - NDimUpStatic;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_scan_;
static constexpr auto static_partial_up_lengths_ = StaticPartialUpLengths{};
static constexpr auto static_partial_up_lengths_scan_ = reverse_exclusive_scan_sequence(
static_partial_up_lengths_, math::multiplies<index_t>(), Number<1>{});
__host__ __device__ constexpr HackSemiDynamicUnMerge(
const MultiIndex<NDimUpDynamic>& dynamic_partial_up_lengths)
: dynamic_partial_up_lengths_{dynamic_partial_up_lengths},
dynamic_partial_up_lengths_scan_{
container_reverse_exclusive_scan(dynamic_partial_up_lengths,
math::multiplies<index_t>(),
static_partial_up_lengths_scan_[Number<0>{}])}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ constexpr HackSemiDynamicUnMerge()
: up_lengths_{make_zero_multi_index<NDimUpDynamic>()},
up_lengths_scan_{make_zero_multi_index<NDimUpStatic>()}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ constexpr const auto GetUpperLengths() const
{
UpperIndex up_lengths;
static_for<0, NDimUpDynamic, 1>{}(
[&](auto i) { up_lengths(i) = dynamic_partial_up_lengths_[i]; });
static_for<0, NDimUpStatic, 1>{}([&](auto i) {
up_lengths(i + Number<NDimUpDynamic>{}) = static_partial_up_lengths_[i];
});
return up_lengths;
}
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{
idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
static_for<0, NDimUpDynamic, 1>{}([&](auto i) {
idx_low(Number<0>{}) += idx_up[i] * dynamic_partial_up_lengths_scan_[i];
});
static_for<NDimUpDynamic, NDimUp - 1, 1>{}([&](auto i) {
idx_low(Number<0>{}) +=
idx_up[i] * static_partial_up_lengths_scan_[i + Number<NDimUpDynamic>{}];
});
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
{
CalculateLowerIndex(idx_diff_low, idx_diff_up);
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
template <typename UpIdx>
__host__ __device__ static constexpr bool
IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
{
return true;
}
};
#endif
} // namespace ck
#endif
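Both DynamicUnMerge and the disabled HackSemiDynamicUnMerge above build their strides with a reverse exclusive scan of the upper lengths and then fold the multi-index into one lower offset against those strides. A standalone sketch of the scan and the fold, with plain std::vector (illustrative only):

    #include <cstdio>
    #include <vector>

    // Reverse exclusive scan with multiplication: stride[i] = product of lengths[i+1 .. n-1],
    // stride[n-1] = 1. This plays the role of up_lengths_scan_ in the unmerge transforms above.
    std::vector<int> reverse_exclusive_scan_mul(const std::vector<int>& lengths)
    {
        std::vector<int> scan(lengths.size(), 1);
        for(int i = static_cast<int>(lengths.size()) - 2; i >= 0; --i)
            scan[i] = scan[i + 1] * lengths[i + 1];
        return scan;
    }

    int main()
    {
        const auto scan = reverse_exclusive_scan_mul({4, 3, 8}); // -> {24, 8, 1}

        int flat = 0;
        const int idx[3] = {2, 0, 1};
        for(int i = 0; i < 3; ++i)
            flat += idx[i] * scan[i];

        std::printf("flat = %d\n", flat); // prints flat = 49
    }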
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp (View file @ 77c81617)

@@ -96,162 +96,39 @@ struct BlockwiseDynamicTensorSliceTransfer_v4
         }
     }

-    __device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
-    {
-        if(BlockSize == thread_cluster_desc_.GetElementSize() or
-           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
-        {
-            threadwise_transfer_.RunWrite(dst_desc, p_dst);
-        }
-    }
-
-    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
-    {
-        if(BlockSize == thread_cluster_desc_.GetElementSize() or
-           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
-        {
-            threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
-        }
-    }
-
-    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step)
-    {
-        if(BlockSize == thread_cluster_desc_.GetElementSize() or
-           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
-        {
-            threadwise_transfer_.MoveDstSliceWindow(dst_desc, step);
-        }
-    }
-
-    static constexpr auto thread_cluster_desc_ =
-        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
-
-    using ThreadwiseTransfer =
-        ThreadwiseDynamicTensorSliceTransfer_v3<ThreadSliceLengths, DstInMemOp, SrcData, DstData,
-                                                SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder,
-                                                SrcVectorDim, DstVectorDim, SrcScalarPerVector,
-                                                DstScalarPerVector, SrcScalarStrideInVector,
-                                                DstScalarStrideInVector, SrcAddressSpace, DstAddressSpace,
-                                                ThreadTransferSrcResetCoordinateAfterRun,
-                                                ThreadTransferDstResetCoordinateAfterRun>;
-
-    ThreadwiseTransfer threadwise_transfer_;
-};
-
-// this version does following things to avoid scratch memory issue
-// 1. Use StaticallyIndexedArray instead of C array for thread buffer
-// 2. ThreadwiseDynamicTensorSliceTransfer_v3 does not keep reference to tensor descriptor
-// 3. ThreadwiseDynamicTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
-template <index_t BlockSize,
-          InMemoryDataOperation DstInMemOp,
-          typename BlockSliceLengths, typename ThreadSliceLengths,
-          typename ThreadClusterLengths, typename ThreadClusterArrangeOrder,
-          typename SrcData, typename DstData,
-          typename SrcDesc, typename DstDesc,
-          typename SrcDimAccessOrder, typename DstDimAccessOrder,
-          index_t SrcVectorDim, index_t DstVectorDim,
-          index_t SrcScalarPerVector, index_t DstScalarPerVector,
-          AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace,
-          index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector,
-          index_t ThreadTransferSrcResetCoordinateAfterRun,
-          index_t ThreadTransferDstResetCoordinateAfterRun>
-struct BlockwiseDynamicTensorSliceTransfer_v4_hack
-{
-    static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();
-
-    using Index = MultiIndex<nDim>;
-
-    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v4_hack(
-        const SrcDesc& src_desc,
-        const Index& src_block_slice_origin,
-        const DstDesc& dst_desc,
-        const Index& dst_block_slice_origin)
-        : threadwise_transfer_(
-              src_desc, make_zero_multi_index<nDim>(), dst_desc, make_zero_multi_index<nDim>())
-    {
-        static_assert(nDim == remove_reference_t<remove_cv_t<SrcDesc>>::GetNumOfDimension() &&
-                          nDim == remove_reference_t<remove_cv_t<DstDesc>>::GetNumOfDimension() &&
-                          nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() &&
-                          nDim == ThreadClusterLengths::Size() &&
-                          nDim == ThreadClusterArrangeOrder::Size() &&
-                          nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(),
-                      "wrong! nDim not consistent");
-
-        static_assert(
-            is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
-            "wrong! threads should be mapped to cover entire slicing window");
-
-        static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
-                      "wrong! BlockSize too small");
-
-        if(BlockSize == thread_cluster_desc_.GetElementSize() or
-           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
-        {
-            const auto thread_cluster_id =
-                thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());
-
-            const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
-
-            threadwise_transfer_.SetSrcSliceOrigin(src_desc,
-                                                   src_block_slice_origin + thread_data_id_begin);
-            threadwise_transfer_.SetDstSliceOrigin(dst_desc,
-                                                   dst_block_slice_origin + thread_data_id_begin);
-        }
-    }
-
-    __device__ static constexpr auto CalculateThreadDataBegin()
-    {
-        const auto thread_cluster_id =
-            thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());
-
-        return thread_cluster_id * ThreadSliceLengths{};
-    }
-
-    __device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
-    {
-        if(BlockSize == thread_cluster_desc_.GetElementSize() or
-           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
-        {
-            threadwise_transfer_.RunRead(src_desc, p_src);
-        }
-    }
-
-    __device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
+    __device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
     {
         if(BlockSize == thread_cluster_desc_.GetElementSize() or
            get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
         {
-            threadwise_transfer_.RunWrite(dst_desc, p_dst);
+            threadwise_transfer_.RunRead_hack(src_desc, p_src);
         }
     }

-    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
+    __device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
     {
         if(BlockSize == thread_cluster_desc_.GetElementSize() or
            get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
         {
-            threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
+            threadwise_transfer_.RunWrite(dst_desc, p_dst);
         }
     }

+    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step)
+    {
+        if(BlockSize == thread_cluster_desc_.GetElementSize() or
+           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrcSliceWindow(src_desc, step);
+        }
+    }
+
+    __device__ void MoveSrcSliceWindow_hack(const SrcDesc& src_desc, const Index& step)
+    {
+        if(BlockSize == thread_cluster_desc_.GetElementSize() or
+           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrcSliceWindow_hack(src_desc, step);
+        }
+    }
+
@@ -268,7 +145,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v4_hack
         make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

     using ThreadwiseTransfer =
-        ThreadwiseDynamicTensorSliceTransfer_v3_hack<ThreadSliceLengths,
+        ThreadwiseDynamicTensorSliceTransfer_v3<ThreadSliceLengths,
                                                      DstInMemOp,
                                                      SrcData,
                                                      DstData,
...
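The blockwise transfer above lets only the first ThreadClusterLengths-many threads of the block participate (the "BlockSize == cluster size or thread id < cluster size" guard) and gives each participating thread a slice origin of thread_cluster_id * ThreadSliceLengths. A host-side sketch of that thread-to-slice mapping for a 2D cluster; the extents below are made up for illustration only.

    #include <cstdio>

    // Host-side model of the blockwise transfer's thread-to-slice mapping:
    // a linear thread id is decomposed into a 2D cluster index, and each thread's
    // slice origin is cluster_index * thread_slice_lengths.
    int main()
    {
        const int cluster_len[2] = {4, 8}; // ThreadClusterLengths
        const int slice_len[2]   = {2, 4}; // ThreadSliceLengths (per-thread tile)
        const int cluster_size   = cluster_len[0] * cluster_len[1];
        const int block_size     = 64;     // BlockSize may exceed the cluster size

        for(int tid = 0; tid < block_size; ++tid)
        {
            // threads beyond the cluster stay idle, mirroring the guard in RunRead/RunWrite
            if(!(block_size == cluster_size || tid < cluster_size))
                continue;

            const int cluster_id[2] = {tid / cluster_len[1], tid % cluster_len[1]};
            const int origin[2]     = {cluster_id[0] * slice_len[0], cluster_id[1] * slice_len[1]};
            std::printf("tid %2d -> slice origin (%d, %d)\n", tid, origin[0], origin[1]);
        }
    }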
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp (View file @ 77c81617)

@@ -166,7 +166,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
         // B matrix blockwise copy
         auto b_blockwise_copy =
-            BlockwiseDynamicTensorSliceTransfer_v4_hack<BlockSize,
+            BlockwiseDynamicTensorSliceTransfer_v4<BlockSize,
                                                         InMemoryDataOperation::Set,
                                                         Sequence<KPerBlock, NPerBlock>,
                                                         BBlockTransferThreadSliceLengths_K_N,
@@ -258,16 +258,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0);
         constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0);

-#if 1
         // LDS double buffer: preload data into LDS
         {
             a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
-            b_blockwise_copy.RunRead(b_k_n_global_desc, p_b_global);
+            b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);

             a_blockwise_copy.RunWrite(a_k_m_block_desc, p_a_block_double);
             b_blockwise_copy.RunWrite(b_k_n_block_desc, p_b_block_double);
         }
-#endif

         if constexpr(HasMainKBlockLoop)
         {
@@ -285,13 +283,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
             {
                 // even iteration
                 a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
-                b_blockwise_copy.MoveSrcSliceWindow(b_k_n_global_desc, b_block_slice_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc, b_block_slice_copy_step);

                 __syncthreads();

                 // LDS doubel buffer: load next data from device mem
                 a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
-                b_blockwise_copy.RunRead(b_k_n_global_desc, p_b_global);
+                b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);

                 // LDS double buffer: GEMM on current data
                 blockwise_gemm.Run(p_a_block_even, p_b_block_even, p_c_thread);
@@ -302,13 +301,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                 // odd iteration
                 a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
-                b_blockwise_copy.MoveSrcSliceWindow(b_k_n_global_desc, b_block_slice_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc, b_block_slice_copy_step);

                 __syncthreads();

                 // LDS doubel buffer: load next data from device mem
                 a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
-                b_blockwise_copy.RunRead(b_k_n_global_desc, p_b_global);
+                b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);

                 // LDS double buffer: GEMM on current data
                 blockwise_gemm.Run(p_a_block_odd, p_b_block_odd, p_c_thread);
@@ -326,13 +326,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
         if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left
         {
             a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
-            b_blockwise_copy.MoveSrcSliceWindow(b_k_n_global_desc, b_block_slice_copy_step);
+            b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc, b_block_slice_copy_step);

             __syncthreads();

             // LDS double buffer: load last data from device mem
             a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
-            b_blockwise_copy.RunRead(b_k_n_global_desc, p_b_global);
+            b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);

             // LDS double buffer: GEMM on 2nd-last data
             blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread);
@@ -384,8 +384,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                 Float,
                 decltype(c_m0_m1_n0_n1_thread_desc),
                 decltype(c_m0_m1_n0_n1_global_desc),
+#if 1 // debug
                 Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
                 CThreadTransferSrcDstAccessOrder,
+#else
+                Sequence<1, 1, 2, 4>,
+                Sequence<0, 1, 2, 3>,
+#endif
                 CThreadTransferSrcDstVectorDim,
                 1,
                 CThreadTransferDstScalarPerVector,
@@ -402,7 +407,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                 m_thread_data_on_global % M1,
                 n_thread_data_on_global / N1,
                 n_thread_data_on_global % N1))
-                .Run(c_m0_m1_n0_n1_thread_desc, p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
+                .Run_hack(c_m0_m1_n0_n1_thread_desc, p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
         }
     }
...
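The main loop above is a classic LDS double buffer: while the block-wise GEMM consumes one buffer, the next K tile is fetched from global memory into the other, with a __syncthreads() between moving the copy window and reusing the buffer. The sketch below is only a schematic of that iteration pattern; the printf calls stand in for the real reads, writes and GEMM calls, and it assumes an even number of K tiles (the double-tail case).

    #include <cstdio>

    // Schematic of the double-buffered K loop in the gridwise GEMM above:
    // preload tile 0, then in each pair of iterations read the next tile into one
    // buffer while computing on the other, finishing with a two-tile tail.
    int main()
    {
        const int num_k_tiles = 6; // must be even for this schematic
        std::printf("preload tile 0 -> buffer even\n");

        for(int k = 0; k + 2 < num_k_tiles; k += 2)
        {
            // even iteration: fetch the next tile into the odd buffer, compute on the even one
            std::printf("read tile %d -> buffer odd,  gemm on buffer even (tile %d)\n", k + 1, k);
            // odd iteration: fetch the tile after that into the even buffer, compute on the odd one
            std::printf("read tile %d -> buffer even, gemm on buffer odd  (tile %d)\n", k + 2, k + 1);
        }

        // tail: two tiles left -> one more fetch, then compute both
        std::printf("read tile %d -> buffer odd,  gemm on buffer even (tile %d)\n",
                    num_k_tiles - 1, num_k_tiles - 2);
        std::printf("gemm on buffer odd (tile %d)\n", num_k_tiles - 1);
    }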
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
View file @
77c81617
...
@@ -25,7 +25,7 @@ template <typename SrcData,
...
@@ -25,7 +25,7 @@ template <typename SrcData,
index_t
SrcScalarStrideInVector
,
index_t
SrcScalarStrideInVector
,
index_t
DstScalarStrideInVector
,
index_t
DstScalarStrideInVector
,
bool
SrcResetCoordinateAfterRun
,
// control whether to move back src coordinate after each
bool
SrcResetCoordinateAfterRun
,
// control whether to move back src coordinate after each
// Run
Read
(), will be fused with MoveSrcSliceWindow to
// Run(), will be fused with MoveSrcSliceWindow to
// save addr computation
// save addr computation
bool
DstResetCoordinateAfterRun
>
// control whether to move back dst coordinate after each
bool
DstResetCoordinateAfterRun
>
// control whether to move back dst coordinate after each
// RunWrite(), will be fused with MoveDstSliceWindow to
// RunWrite(), will be fused with MoveDstSliceWindow to
...
@@ -333,7 +333,307 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
...
@@ -333,7 +333,307 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r2
back_step
(
i
)
=
(
SliceLengths
{}[
i
-
Number
<
1
>
{}]
%
2
==
0
)
?
0
:
(
1
-
SliceLengths
{}[
i
]);
back_step
(
i
)
=
(
SliceLengths
{}[
i
-
Number
<
1
>
{}]
%
2
==
0
)
?
0
:
(
1
-
SliceLengths
{}[
i
]);
});
});
return
back_step
;
return
back_step
;
}
__device__
void
Run_hack
(
const
SrcDesc
&
src_desc
,
const
SrcData
*
p_src
,
const
DstDesc
&
dst_desc
,
DstData
*
p_dst
)
{
if
constexpr
(
remove_reference_t
<
SrcDesc
>::
GetNumOfDimension
()
==
2
)
{
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
const
auto
src_step_0_p1
=
make_dynamic_tensor_coordinate_step
(
src_desc
,
make_multi_index
(
0
,
1
));
const
auto
src_step_0_m1
=
make_dynamic_tensor_coordinate_step
(
src_desc
,
make_multi_index
(
0
,
-
1
));
const
auto
src_step_p1_0
=
make_dynamic_tensor_coordinate_step
(
src_desc
,
make_multi_index
(
1
,
0
));
const
auto
src_step_m1_0
=
make_dynamic_tensor_coordinate_step
(
src_desc
,
make_multi_index
(
-
1
,
0
));
const
auto
dst_step_0_p1
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
make_multi_index
(
0
,
1
));
const
auto
dst_step_0_m1
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
make_multi_index
(
0
,
-
1
));
const
auto
dst_step_p1_0
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
make_multi_index
(
1
,
0
));
const
auto
dst_step_m1_0
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
make_multi_index
(
-
1
,
0
));
constexpr
index_t
Len0
=
SliceLengths
{}[
0
];
constexpr
index_t
Len1
=
SliceLengths
{}[
1
];
#pragma unroll
for
(
index_t
iter0
=
0
;
iter0
<
Len0
;
++
iter0
)
{
#pragma unroll
for
(
index_t
iter1
=
0
;
iter1
<
Len1
;
++
iter1
)
{
// do work
transfer_data
<
SrcData
,
1
,
SrcAddressSpace
,
DstAddressSpace
,
DstInMemOp
,
SrcScalarStrideInVector
,
DstScalarStrideInVector
>
(
p_src
,
src_slice_origin_
.
GetOffset
(),
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
src_desc
,
src_slice_origin_
),
src_desc
.
GetElementSpaceSize
(),
p_dst
,
dst_slice_origin_
.
GetOffset
(),
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
dst_desc
,
dst_slice_origin_
),
dst_desc
.
GetElementSpaceSize
());
// move dim1 iterator
if
(
iter1
<
Len1
-
1
)
{
bool
forward_dim1
=
(
iter0
%
2
==
0
);
if
(
forward_dim1
)
{
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_step_0_p1
);
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_
,
dst_step_0_p1
);
}
else
{
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_step_0_m1
);
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_
,
dst_step_0_m1
);
}
}
}
// move dim0 iterator
if
(
iter0
<
Len0
-
1
)
{
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_step_p1_0
);
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_
,
dst_step_p1_0
);
}
}
}
else
if
constexpr
(
remove_reference_t
<
SrcDesc
>::
GetNumOfDimension
()
==
4
)
{
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
#if 0
const auto src_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, 1));
const auto src_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 0, -1));
const auto src_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, 1, 0));
const auto src_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 0, -1, 0));
const auto src_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto src_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto src_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto src_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
const auto dst_step_0_0_0_p1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, 1));
const auto dst_step_0_0_0_m1 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 0, -1));
const auto dst_step_0_0_p1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, 1, 0));
const auto dst_step_0_0_m1_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 0, -1, 0));
const auto dst_step_0_p1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1, 0, 0));
const auto dst_step_0_m1_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1, 0, 0));
const auto dst_step_p1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0, 0, 0));
const auto dst_step_m1_0_0_0 =
make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0, 0, 0));
#else
// hack for output tensor
const
auto
src_step_0_0_0_p1
=
make_dynamic_tensor_coordinate_step
(
src_desc
,
make_multi_index
(
0
,
0
,
0
,
1
));
const
auto
src_step_0_0_0_m1
=
make_dynamic_tensor_coordinate_step
(
src_desc
,
make_multi_index
(
0
,
0
,
0
,
-
1
));
const
auto
src_step_0_0_p1_0
=
make_dynamic_tensor_coordinate_step
(
src_desc
,
make_multi_index
(
0
,
0
,
1
,
0
));
const
auto
src_step_0_0_m1_0
=
make_dynamic_tensor_coordinate_step
(
src_desc
,
make_multi_index
(
0
,
0
,
-
1
,
0
));
const
auto
src_step_0_p1_0_0
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
make_multi_index
(
0
,
1
,
0
,
0
));
const
auto
src_step_0_m1_0_0
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
make_multi_index
(
0
,
-
1
,
0
,
0
));
const
auto
src_step_p1_0_0_0
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
make_multi_index
(
1
,
0
,
0
,
0
));
const
auto
src_step_m1_0_0_0
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
make_multi_index
(
-
1
,
0
,
0
,
0
));
const
auto
dst_step_0_0_0_p1
=
make_dynamic_tensor_coordinate_step_hack
(
dst_desc
,
make_multi_index
(
0
,
0
,
0
,
1
),
Sequence
<
0
,
0
,
1
,
0
,
0
>
{});
const
auto
dst_step_0_0_0_m1
=
make_dynamic_tensor_coordinate_step_hack
(
dst_desc
,
make_multi_index
(
0
,
0
,
0
,
-
1
),
Sequence
<
0
,
0
,
2
,
0
,
0
>
{});
const
auto
dst_step_0_0_p1_0
=
make_dynamic_tensor_coordinate_step_hack
(
dst_desc
,
make_multi_index
(
0
,
0
,
1
,
0
),
Sequence
<
0
,
0
,
1
,
0
,
0
>
{});
const
auto
dst_step_0_0_m1_0
=
make_dynamic_tensor_coordinate_step_hack
(
dst_desc
,
make_multi_index
(
0
,
0
,
-
1
,
0
),
Sequence
<
0
,
0
,
2
,
0
,
0
>
{});
const
auto
dst_step_0_p1_0_0
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
make_multi_index
(
0
,
1
,
0
,
0
));
const
auto
dst_step_0_m1_0_0
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
make_multi_index
(
0
,
-
1
,
0
,
0
));
const
auto
dst_step_p1_0_0_0
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
make_multi_index
(
1
,
0
,
0
,
0
));
const
auto
dst_step_m1_0_0_0
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
make_multi_index
(
-
1
,
0
,
0
,
0
));
#endif
constexpr
index_t
Len0
=
SliceLengths
{}[
0
];
constexpr
index_t
Len1
=
SliceLengths
{}[
1
];
constexpr
index_t
Len2
=
SliceLengths
{}[
2
];
constexpr
index_t
Len3
=
SliceLengths
{}[
3
];
#pragma unroll
for
(
index_t
iter0
=
0
;
iter0
<
Len0
;
++
iter0
)
{
#pragma unroll
for
(
index_t
iter1
=
0
;
iter1
<
Len1
;
++
iter1
)
{
#pragma unroll
for
(
index_t
iter2
=
0
;
iter2
<
Len2
;
++
iter2
)
{
#pragma unroll
for
(
index_t
iter3
=
0
;
iter3
<
Len3
;
++
iter3
)
{
// do work
transfer_data
<
SrcData
,
1
,
SrcAddressSpace
,
DstAddressSpace
,
DstInMemOp
,
SrcScalarStrideInVector
,
DstScalarStrideInVector
>
(
p_src
,
src_slice_origin_
.
GetOffset
(),
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
src_desc
,
src_slice_origin_
),
src_desc
.
GetElementSpaceSize
(),
p_dst
,
dst_slice_origin_
.
GetOffset
(),
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
dst_desc
,
dst_slice_origin_
),
dst_desc
.
GetElementSpaceSize
());
// move dim1 iterator
if
(
iter3
<
Len3
-
1
)
{
bool
forward_dim3
=
(
iter2
%
2
==
0
);
if
(
forward_dim3
)
{
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_step_0_0_0_p1
);
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_
,
dst_step_0_0_0_p1
);
}
else
{
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_step_0_0_0_m1
);
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_
,
dst_step_0_0_0_m1
);
}
}
}
// move dim1 iterator
if
(
iter2
<
Len2
-
1
)
{
bool
forward_dim2
=
(
iter1
%
2
==
0
);
if
(
forward_dim2
)
{
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_step_0_0_p1_0
);
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_
,
dst_step_0_0_p1_0
);
}
else
{
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_step_0_0_m1_0
);
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_
,
dst_step_0_0_m1_0
);
}
}
}
// move dim1 iterator
if
(
iter1
<
Len1
-
1
)
{
bool
forward_dim1
=
(
iter0
%
2
==
0
);
if
(
forward_dim1
)
{
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_step_0_p1_0_0
);
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_
,
dst_step_0_p1_0_0
);
}
else
{
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_step_0_m1_0_0
);
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_
,
dst_step_0_m1_0_0
);
}
}
}
// move dim0 iterator:
if
(
iter0
<
Len0
-
1
)
{
// move forward in dim0
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_step_p1_0_0_0
);
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_
,
dst_step_p1_0_0_0
);
}
}
}
// move src and dst coordinate back to their origins
if
constexpr
(
SrcResetCoordinateAfterRun
)
{
const
auto
src_back_step
=
make_dynamic_tensor_coordinate_step
(
src_desc
,
GetCoordinateBackStep
());
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_back_step
);
}
if
constexpr
(
DstResetCoordinateAfterRun
)
{
const
auto
dst_back_step
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
GetCoordinateBackStep
());
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_
,
dst_back_step
);
}
}
}
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
// src_slice_origin_step_idx need to be known at compile-time, for performance reason
...
@@ -591,132 +891,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
...
@@ -591,132 +891,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
}
}
}
}
-   __device__ static constexpr auto GetCoordinateBackStep()
+   __device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
    {
        MultiIndex<nDim> back_step;

        back_step(Number<0>{}) = 1 - SliceLengths{}[0];

        static_for<1, nDim, 1>{}([&](auto i) {
            back_step(i) =
                (SliceLengths{}[i - Number<1>{}] % 2 == 0) ? 0 : (1 - SliceLengths{}[i]);
        });

        return back_step;
    }
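A quick worked example of the back step computed above (my own illustration, not part of the commit): after the snake traversal, dimension 0 always ends on its last index, while an inner dimension ends at 0 exactly when the next-outer length is even. For SliceLengths = {4, 3} the coordinate therefore ends at (3, 0), and the back step {1 - 4, 0} = {-3, 0} returns it to the slice origin. The helper and names below are hypothetical, plain C++:

#include <array>
#include <cassert>

// back step for a 2D slice, following the rule in GetCoordinateBackStep above
std::array<int, 2> coordinate_back_step(std::array<int, 2> len)
{
    std::array<int, 2> back{};
    back[0] = 1 - len[0];                            // dim0 always ends at len[0] - 1
    back[1] = (len[0] % 2 == 0) ? 0 : (1 - len[1]);  // dim1 ends at 0 if len[0] is even
    return back;
}

int main()
{
    const auto back = coordinate_back_step({4, 3});
    assert(back[0] == -3 && back[1] == 0);
    return 0;
}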
    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                       const Index& src_slice_origin_step_idx)
    {
        // if src coord was not reset by RunRead(), then the step needs to be adjusted here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetCoordinateBackStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_dynamic_tensor_coordinate_step(src_desc, adjusted_step_idx);

        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
    }

    // dst_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by RunWrite(), then the step needs to be adjusted here
        const auto adjusted_step_idx =
            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
                                       : dst_slice_origin_step_idx + GetCoordinateBackStep();

        // is it OK to construct a new step every time?
        const auto adjusted_step = make_dynamic_tensor_coordinate_step(dst_desc, adjusted_step_idx);

        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, adjusted_step);
    }
    private:
    static constexpr auto buffer_desc_ =
        make_dynamic_naive_tensor_descriptor_packed<nDim>(to_multi_index(SliceLengths{}));

    static constexpr index_t buffer_size_ = buffer_desc_.GetElementSpaceSize();

    StaticallyIndexedArray<SrcData, buffer_size_> buffer_;

    SrcCoord src_slice_origin_;
    DstCoord dst_slice_origin_;
};
// This version does the following things to avoid "alloca" in LLVM-IR, which would cause scratch
// memory usage and sometimes useless instructions:
//   1. It does not keep a reference to the tensor descriptor
//   2. It does not construct a new tensor coordinate for this->Run()
//   3. It does not use a pointer into the VGPR thread buffer
//   4. It calculates the offset into the thread buffer directly, instead of moving a coordinate
// (a standalone sketch of the static-indexing idea behind points 3 and 4 follows after this struct)
template <typename SliceLengths,
          InMemoryDataOperation DstInMemOp,
          typename SrcData,
          typename DstData,
          typename SrcDesc,
          typename DstDesc,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorDim,
          index_t DstVectorDim,
          index_t SrcScalarPerVector,
          index_t DstScalarPerVector,
          index_t SrcScalarStrideInVector,
          index_t DstScalarStrideInVector,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace,
          bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
                                           // RunRead(), will be fused with MoveSrcSliceWindow to
                                           // save addr computation
          bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
                                           // RunWrite(), will be fused with MoveDstSliceWindow to
                                           // save addr computation
struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
{
    static constexpr index_t nDim = SliceLengths::Size();

    using Index = MultiIndex<nDim>;

    using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
    using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));

    using SrcCoordStep = decltype(make_dynamic_tensor_coordinate_step(SrcDesc{}, Index{}));
    using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));

    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3_hack(const SrcDesc& src_desc,
                                                                      const Index& src_slice_origin,
                                                                      const DstDesc& dst_desc,
                                                                      const Index& dst_slice_origin)
        : src_slice_origin_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin)),
          dst_slice_origin_(make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin))
    {
        static_assert(SrcAddressSpace == AddressSpace::Global or
                          SrcAddressSpace == AddressSpace::Lds,
                      "wrong!");
        static_assert(DstAddressSpace == AddressSpace::Global or
                          DstAddressSpace == AddressSpace::Lds,
                      "wrong!");
    }

    __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3_hack()
        : ThreadwiseDynamicTensorSliceTransfer_v3_hack(
              SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
    {
    }

    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
    {
        src_slice_origin_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
    }

    __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
    {
        dst_slice_origin_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
    }

    __device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
    {
        static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
                      "wrong! hardcoded for 2D tensor");
...
@@ -736,44 +911,18 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
            const auto src_step_m1_0 =
                make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
#elif 0
            const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
                make_multi_index(0, 1),
                Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});

            const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
                make_multi_index(0, -1),
                Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});

            const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
                make_multi_index(1, 0),
                Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});

            const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
                make_multi_index(-1, 0),
                Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{});
#elif 1
            // for padded input tensor
            const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
-               make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
+               make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});

            const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
-               make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});
+               make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});

            const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
-               make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
+               make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});

            const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
                src_desc,
-               make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
+               make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
-#elif 0
+#elif 1
            // for non-padded input tensor
            const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
                src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});
...
@@ -806,12 +955,23 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
                    static_assert(SrcAddressSpace == AddressSpace::Global,
                                  "wrong! hardcoded to use buffer_load, src must be global mem");

#if 0 // debug
                    buffer_(Number<buffer_offset>{}) = amd_buffer_load<SrcData, 1>(
                        p_src,
                        src_slice_origin_.GetOffset(),
                        coordinate_has_valid_offset_assuming_visible_index_is_valid(
                            src_desc, src_slice_origin_),
                        src_desc.GetElementSpaceSize());
#else
                    SrcData tmp = amd_buffer_load<SrcData, 1>(
                        p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

                    const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
                        src_desc, src_slice_origin_);

                    buffer_(Number<buffer_offset>{}) = is_valid ? tmp : SrcData{0};
#endif
                    // move dim1 iterator
                    if constexpr(iter1.value < Len1 - 1)
...
@@ -847,83 +1007,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
        }
    }
    __device__ void RunWrite(const DstDesc& dst_desc, DstData* p_dst)
    {
        static_assert(remove_reference_t<DstDesc>::GetNumOfDimension() == 2,
                      "wrong! hardcoded for 2D tensor");

        // hardcoded for 2D
        // TODO implement N-D
        if constexpr(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2)
        {
            // TODO use constexpr for coordinate-step to make sure the compiler behaves correctly
            const auto dst_step_0_p1 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, 1));
            const auto dst_step_0_m1 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(0, -1));
            const auto dst_step_p1_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(1, 0));
            const auto dst_step_m1_0 =
                make_dynamic_tensor_coordinate_step(dst_desc, make_multi_index(-1, 0));

            constexpr index_t Len0 = SliceLengths{}[0];
            constexpr index_t Len1 = SliceLengths{}[1];

            static_for<0, Len0, 1>{}([&](auto iter0) {
                static_for<0, Len1, 1>{}([&](auto iter1) {
                    // step direction
                    constexpr bool forward_dim1 = (iter0.value % 2 == 0);

                    constexpr index_t i0 = iter0;
                    constexpr index_t i1 = forward_dim1 ? iter1.value : Len1 - iter1.value - 1;

                    // do work
                    constexpr index_t buffer_offset =
                        buffer_desc_.CalculateOffset(make_multi_index(i0, i1));

                    // hardcoded for ds_write
                    // TODO refactor transfer_data() to encapsulate this
                    static_assert(DstAddressSpace == AddressSpace::Lds &&
                                      DstInMemOp == InMemoryDataOperation::Set,
                                  "wrong! hardcoded for ds_write");

                    p_dst[dst_slice_origin_.GetOffset()] = buffer_[Number<buffer_offset>{}];

                    // move dim1 iterator
                    if constexpr(iter1.value < Len1 - 1)
                    {
                        if constexpr(forward_dim1)
                        {
                            move_dynamic_tensor_coordinate(
                                dst_desc, dst_slice_origin_, dst_step_0_p1);
                        }
                        else
                        {
                            move_dynamic_tensor_coordinate(
                                dst_desc, dst_slice_origin_, dst_step_0_m1);
                        }
                    }
                });

                // move dim0 iterator
                if constexpr(iter0.value < Len0 - 1)
                {
                    move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_step_p1_0);
                }
            });
        }

        // move dst coordinate back to its slice origin
        if constexpr(DstResetCoordinateAfterRun)
        {
            const auto dst_back_step =
                make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateBackStep());

            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
        }
    }
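RunWrite above unrolls both loops with static_for, so iter0 and iter1 are compile-time constants and the thread-buffer offset is itself a constexpr. A rough standalone sketch of the same compile-time-loop idea in standard C++17 (my own illustration, not CK's static_for; names and the printout are arbitrary):

#include <array>
#include <cstdio>
#include <utility>

// call f(integral_constant<i>) for i = 0..N-1; the pack expansion unrolls at compile time
template <typename F, std::size_t... Is>
constexpr void static_for_impl(F&& f, std::index_sequence<Is...>)
{
    (f(std::integral_constant<std::size_t, Is>{}), ...);
}

template <std::size_t N, typename F>
constexpr void static_for(F&& f)
{
    static_for_impl(std::forward<F>(f), std::make_index_sequence<N>{});
}

int main()
{
    std::array<int, 4> buf{10, 20, 30, 40};

    static_for<4>([&](auto i) {
        // decltype(i)::value is a compile-time constant, so the element access
        // below never turns into a runtime-indexed private-array access
        constexpr std::size_t idx = decltype(i)::value;
        std::printf("buf[%zu] = %d\n", idx, std::get<idx>(buf));
    });
    return 0;
}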
    __device__ static constexpr auto GetCoordinateBackStep()
    {
        MultiIndex<nDim> back_step;
...
@@ -947,18 +1030,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
                                       : src_slice_origin_step_idx + GetCoordinateBackStep();
        // is it OK to construct a new step every time?
#if 0 // hack
        const auto adjusted_step = make_dynamic_tensor_coordinate_step(src_desc, adjusted_step_idx);
#elif 1
        // for padded input tensor
        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
            src_desc,
            adjusted_step_idx,
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
#elif 1
        // for non-padded input tensor
        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
#endif

        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
    }
...
@@ -978,6 +1050,32 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, adjusted_step);
    }
    // src_slice_origin_step_idx needs to be known at compile-time, for performance reasons
    __device__ void MoveSrcSliceWindow_hack(const SrcDesc& src_desc,
                                            const Index& src_slice_origin_step_idx)
    {
        // if src coord was not reset by RunRead(), then the step needs to be adjusted here
        const auto adjusted_step_idx =
            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetCoordinateBackStep();

        // is it OK to construct a new step every time?
#if 0 // hack
        const auto adjusted_step = make_dynamic_tensor_coordinate_step(src_desc, adjusted_step_idx);
#elif 1
        // for padded input tensor
        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
#elif 0
        // for non-padded input tensor
        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
#endif

        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
    }
    private:
    static constexpr auto buffer_desc_ =
        make_dynamic_naive_tensor_descriptor_packed<nDim>(to_multi_index(SliceLengths{}));
...
@@ -989,5 +1087,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3_hack
    SrcCoord src_slice_origin_;
    DstCoord dst_slice_origin_;
};
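As noted in the comment introducing ThreadwiseDynamicTensorSliceTransfer_v3_hack, keeping the thread buffer statically indexed (points 3 and 4) is what keeps it out of scratch memory. A rough host-compilable sketch of the underlying idea (my own illustration, not CK code; the names are hypothetical):

// Indexing a thread-private array through a runtime value can force the backend to
// materialize it in memory ("alloca" in LLVM-IR), while purely compile-time indexing
// lets every element live in a register.
#include <cstdio>
#include <tuple>

int runtime_indexed(int sel)
{
    int buf[4] = {1, 2, 3, 4};
    return buf[sel]; // sel is only known at run time, so buf may need an address
}

int statically_indexed()
{
    std::tuple<int, int, int, int> buf{1, 2, 3, 4};
    // every access is resolved at compile time, so no addressable buffer is needed
    return std::get<0>(buf) + std::get<3>(buf);
}

int main()
{
    std::printf("%d %d\n", runtime_indexed(2), statically_indexed());
    return 0;
}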
} // namespace ck
#endif
composable_kernel/include/utility/amd_buffer_addressing.hpp
...
@@ -152,10 +152,15 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
    return __llvm_amdgcn_buffer_load_f32(
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
#if 1 // debug
    float tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

    return src_thread_data_valid ? tmp : float(0);
#else
    return __llvm_amdgcn_buffer_load_f32(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
#endif
#endif
}
...
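The change above keeps the raw buffer load unconditional and applies the validity check as a select on the returned value rather than as a branch or an in-intrinsic offset shift. A GPU-agnostic sketch of the same pattern in plain C++ (my own illustration; load_or_zero, p, offset and is_valid are hypothetical names, and the sketch assumes offset always stays inside the allocated element space, as the real code guarantees via GetElementSpaceSize):

// load unconditionally, then select zero when the coordinate is not a valid tensor
// element (e.g. it falls in padding); no divergent branch is needed
float load_or_zero(const float* p, int offset, bool is_valid)
{
    const float tmp = p[offset]; // the real code uses the buffer_load intrinsic here
    return is_valid ? tmp : 0.0f;
}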
composable_kernel/include/utility/config.amd.hpp.in
...
@@ -87,7 +87,7 @@
// thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#ifndef CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
-#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
+#define CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 1
#endif

// workaround: put all workaround here
...
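The macro name suggests the flag routes the (thread-invariant) index-difference calculation through a wavefront-uniform scalar read. A minimal hedged sketch of what such a readfirstlane step looks like (my assumption about the intent; the flag itself only toggles code inside dynamic_multi_index_transform.hpp, and to_uniform is a hypothetical name):

// __builtin_amdgcn_readfirstlane broadcasts lane 0's value through a scalar register.
// If the value is already uniform across the wavefront (the "thread-invariant"
// requirement in the comment above), this is semantically a no-op, but it tells the
// compiler the value is scalar and can enable SGPR-based address arithmetic.
__device__ int to_uniform(int thread_invariant_value)
{
    return __builtin_amdgcn_readfirstlane(thread_invariant_value);
}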
composable_kernel/include/utility/sequence.hpp
...
@@ -750,6 +750,13 @@ __host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce,
    return typename sequence_reverse_inclusive_scan<Seq, Reduce, Init>::type{};
}
template <typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr auto reverse_exclusive_scan_sequence(Seq, Reduce, Number<Init>)
{
    return reverse_inclusive_scan_sequence(Seq::PopFront(), Reduce{}, Number<Init>{})
        .PushBack(Number<Init>{});
}
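A quick worked example of what the new helper computes, under my reading of it, with Reduce = multiplication and Init = 1: for Seq<2, 3, 4>, the front element is dropped, the reverse inclusive scan of <3, 4> gives <12, 4>, and pushing back the init yields <12, 4, 1>, which is exactly the packed strides for lengths <2, 3, 4>. A standalone sketch of the same computation on plain arrays (illustration only, not the CK implementation):

#include <array>
#include <cassert>

// reverse exclusive scan with multiplication and init = 1, i.e. suffix products
// excluding the element itself (packed strides from tensor lengths)
template <std::size_t N>
std::array<int, N> reverse_exclusive_scan_mul(const std::array<int, N>& len)
{
    std::array<int, N> out{};
    int running = 1;
    for(std::size_t i = N; i-- > 0;)
    {
        out[i] = running; // product of all lengths to the right of position i
        running *= len[i];
    }
    return out;
}

int main()
{
    const auto strides = reverse_exclusive_scan_mul<3>(std::array<int, 3>{2, 3, 4});
    assert((strides == std::array<int, 3>{12, 4, 1}));
    return 0;
}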
template <typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce, Number<Init>)
{
...
driver/include/device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...
@@ -155,6 +155,7 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
    constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 1
    // cdata = 64, BlockSize = 256, 128x128x8
    // b threadwise copy 4x1
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmMPerBlock = 128;
...
@@ -185,6 +186,40 @@ void device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
    constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN  = 1;
    constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;

    constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 1
    // cdata = 64, BlockSize = 256, 128x128x8
    // b threadwise copy 2x2
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmMPerBlock = 128;
    constexpr index_t GemmNPerBlock = 128;
    constexpr index_t GemmKPerBlock = 8;

    constexpr index_t GemmMPerThread = 4;
    constexpr index_t GemmNPerThread = 4;
    constexpr index_t GemmKPerThread = 1;

    constexpr index_t GemmMLevel0Cluster = 2;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 8;
    constexpr index_t GemmNLevel1Cluster = 8;

    constexpr index_t ThreadGemmDataPerReadM = 4;
    constexpr index_t ThreadGemmDataPerReadN = 4;

    using GemmABlockCopyThreadSliceLengths_GemmK_GemmM   = Sequence<4, 1>;
    using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;

    constexpr index_t GemmABlockCopySrcDataPerRead_GemmK  = 4;
    constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 1;

    using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN   = Sequence<2, 2>;
    using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<4, 64>;

    constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN  = 1;
    constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;

    constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
#elif 1
    // cdata = 64, BlockSize = 256, 128x128x8
...
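The new "b threadwise copy 2x2" configuration only changes how the 8x128 block-copy tiles are split across the 256 threads. A quick hedged sanity check of that arithmetic, using the numbers from the block above (my own illustration, not part of the commit):

// per-thread slice lengths x thread cluster lengths must cover the block copy tile,
// and each cluster must use exactly BlockSize threads
static_assert(4 * 2 == 8 && 1 * 128 == 128, "A block copy tiles [GemmKPerBlock, GemmMPerBlock]");
static_assert(2 * 4 == 8 && 2 * 64 == 128, "B block copy tiles [GemmKPerBlock, GemmNPerBlock]");
static_assert(2 * 128 == 256 && 4 * 64 == 256, "both copies use all BlockSize threads");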
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...
@@ -41,12 +41,13 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-   const auto in_n_c_hi_wi_desc = make_dynamic_naive_tensor_descriptor<4>(
-       to_multi_index(InDesc::GetLengths()), to_multi_index(InDesc::GetStrides()));
-   const auto wei_k_c_y_x_desc = make_dynamic_naive_tensor_descriptor<4>(
-       to_multi_index(WeiDesc::GetLengths()), to_multi_index(WeiDesc::GetStrides()));
-   const auto out_n_k_ho_wo_desc = make_dynamic_naive_tensor_descriptor<4>(
-       to_multi_index(OutDesc::GetLengths()), to_multi_index(OutDesc::GetStrides()));
+   // assume packed tensor
+   const auto in_n_c_hi_wi_desc =
+       make_dynamic_naive_tensor_descriptor_packed<4>(to_multi_index(InDesc::GetLengths()));
+   const auto wei_k_c_y_x_desc =
+       make_dynamic_naive_tensor_descriptor_packed<4>(to_multi_index(WeiDesc::GetLengths()));
+   const auto out_n_k_ho_wo_desc =
+       make_dynamic_naive_tensor_descriptor_packed<4>(to_multi_index(OutDesc::GetLengths()));

    const auto conv_strides   = to_multi_index(ConvStrides{});
    const auto conv_dilations = to_multi_index(ConvDilations{});
...
@@ -115,6 +116,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 1
    // cdata = 64, BlockSize = 256, 128x128x8
    // b thread copy 4x1
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmMPerBlock = 128;
...
@@ -142,6 +144,37 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;

    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 1
    // cdata = 64, BlockSize = 256, 128x128x8
    // b thread copy 2x2
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmMPerBlock = 128;
    constexpr index_t GemmNPerBlock = 128;
    constexpr index_t GemmKPerBlock = 8;

    constexpr index_t GemmMPerThread = 4;
    constexpr index_t GemmNPerThread = 4;
    constexpr index_t GemmKPerThread = 1;

    constexpr index_t GemmMLevel0Cluster = 2;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 8;
    constexpr index_t GemmNLevel1Cluster = 8;

    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<4, 1>;
    using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;

    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK  = 1;
    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM  = 1;

    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN   = Sequence<2, 2>;
    using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<4, 64>;

    constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;

    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#endif
...
@@ -169,7 +202,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    constexpr auto conv_driver =
#if 1 // debug
-       DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw
+       DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#else
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
#endif
...
driver/src/conv_driver.cpp
...
@@ -217,7 +217,7 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
    // 3x3, 35x35, stride 2
    constexpr index_t N = 128;
    constexpr index_t C = 288;
...
@@ -352,7 +352,7 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
    // 3x3, 28x28
    constexpr index_t N = 128;
    constexpr index_t C = 128;
...