gaoqiong / composable_kernel · Commits · c1ed17f8

Commit c1ed17f8, authored Jan 17, 2021 by Chao Liu

    improving index calculation: change to UpdateIndexDiff()

parent 77c81617

Showing 7 changed files with 120 additions and 752 deletions (+120 −752)
Changed files:

composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (+0, -11)
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp  (+113, -720)
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp  (+3, -12)
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp  (+0, -5)
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp  (+2, -2)
composable_kernel/include/utility/amd_buffer_addressing.hpp  (+1, -1)
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (+1, -1)
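Editor's note: the heart of this commit is an interface change in the dynamic multi-index transforms. The old two-step pattern, where CalculateLowerIndexDiff_hack() computed idx_diff_low and the caller then applied idx_low += idx_diff_low itself, is folded into a single UpdateLowerIndex() that computes the lower-index difference and updates the lower index in one call. A minimal sketch of the before/after shape of the call, simplified to one dimension and plain integers; only the function names and the idea come from the diff, everything else is illustrative:

    // Hedged sketch, not the real composable_kernel templates.
    #include <cstdint>
    using index_t = int32_t;

    struct PassThroughSketch
    {
        // Old interface (removed by this commit): caller applies the diff afterwards.
        static void CalculateLowerIndexDiff(index_t& idx_diff_low, const index_t& idx_diff_up)
        {
            idx_diff_low = idx_diff_up;
        }

        // New interface (introduced by this commit): diff calculation and index update fused.
        static void UpdateLowerIndex(index_t& idx_diff_low,
                                     const index_t& idx_diff_up,
                                     index_t& idx_low)
        {
            idx_diff_low = idx_diff_up; // same diff as before
            idx_low += idx_diff_low;    // the transform now advances idx_low itself
        }
    };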
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (view file @ c1ed17f8)

@@ -157,7 +157,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
        const index_t GemmM0 = GemmM / GemmM1;
        const index_t GemmN0 = GemmN / GemmN1;

-#if 1 // debug
        const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc = transform_dynamic_tensor_descriptor(
            out_gemmm_gemmn_global_desc,

@@ -165,16 +164,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
                       DynamicUnMerge<2>{make_multi_index(GemmN0, GemmN1)}),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
-#else
-        const auto out_gemmm0_gemmm1_gemmn0_gemmn1_global_desc = transform_dynamic_tensor_descriptor(
-            out_gemmm_gemmn_global_desc,
-            make_tuple(HackSemiDynamicUnMerge<3, Sequence<GemmM1>>{make_multi_index(1, GemmM0)},
-                       HackSemiDynamicUnMerge<3, Sequence<GemmN1>>{make_multi_index(1, GemmN0)}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}),
-            make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
-#endif

        // GEMM
        using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v1<
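Editor's note: the branch that survives in the driver splits the flat GEMM output dimensions with a plain DynamicUnMerge, so a flat index m in [0, GemmM) is viewed as the pair (m0, m1) with m = m0 * GemmM1 + m1, and likewise for GemmN. A tiny self-contained illustration of that index arithmetic; the lengths below are made up:

    #include <cassert>
    #include <cstdint>
    using index_t = int32_t;

    int main()
    {
        const index_t GemmM1 = 64; // length of the fast-varying sub-dimension
        const index_t m      = 70; // flat GEMM-M index

        // UnMerge: one lower dimension -> two upper dimensions (m0, m1)
        const index_t m0 = m / GemmM1; // 1
        const index_t m1 = m % GemmM1; // 6

        assert(m == m0 * GemmM1 + m1); // Merge is the inverse mapping
        return 0;
    }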
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp  (view file @ c1ed17f8)

@@ -13,11 +13,6 @@ struct DynamicPassThrough
    const UpperIndex up_lengths_;

-#if 0
-    __host__ __device__ constexpr DynamicPassThrough(const DynamicPassThrough&) = default;
-    __host__ __device__ constexpr DynamicPassThrough(DynamicPassThrough&&) = default;
-#else
    __host__ __device__ constexpr DynamicPassThrough(const DynamicPassThrough& other)
        : up_lengths_{other.up_lengths_}
    {

@@ -27,7 +22,6 @@ struct DynamicPassThrough
        : up_lengths_{other.up_lengths_}
    {
    }
-#endif

    __host__ __device__ constexpr DynamicPassThrough(const index_t& low_length)
        : up_lengths_{make_multi_index(low_length)}

@@ -51,31 +45,26 @@ struct DynamicPassThrough
        idx_low(Number<0>{}) = idx_up[Number<0>{}];
    }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                            const UpIdxDiff& idx_diff_up,
-                                                            const LowIdx& /* idx_low_old */,
-                                                            const UpIdx& /* idx_up_old */)
-    {
-        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
-                          UpIdx::Size() == 1,
-                      "wrong! inconsistent # of dimension");
-
-        idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
-    }
-
    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
-    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                 const UpIdxDiff& idx_diff_up,
-                                                                 const LowIdx& idx_low_old,
-                                                                 const UpIdx& idx_up_old,
-                                                                 Number<Hack>)
-    {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
-    }
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx& idx_up_new,
+                                                     Number<Hack>)
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        constexpr auto I0 = Number<0>{};
+
+        idx_diff_low(I0) = idx_diff_up[I0];
+
+        idx_low += idx_diff_low;
+    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -103,11 +92,6 @@ struct DynamicPad
    const index_t left_pad_;
    const index_t right_pad_;

-#if 0
-    __host__ __device__ constexpr DynamicPad(const DynamicPad&) = default;
-    __host__ __device__ constexpr DynamicPad(DynamicPad&&) = default;
-#else
    __host__ __device__ constexpr DynamicPad(const DynamicPad& other)
        : up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
    {

@@ -117,7 +101,6 @@ struct DynamicPad
        : up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}, right_pad_{other.right_pad_}
    {
    }
-#endif

    __host__ __device__ constexpr DynamicPad(const index_t& low_length,
                                             const index_t& left_pad,

@@ -146,32 +129,26 @@ struct DynamicPad
        idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_;
    }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                                      const UpIdxDiff& idx_diff_up,
-                                                                      const LowIdx& /* idx_low_old */,
-                                                                      const UpIdx& /* idx_up_old */)
-    {
-        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
-                          UpIdx::Size() == 1,
-                      "wrong! inconsistent # of dimension");
-
-        idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
-    }
-
    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
-    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                 const UpIdxDiff& idx_diff_up,
-                                                                 const LowIdx& idx_low_old,
-                                                                 const UpIdx& idx_up_old,
-                                                                 Number<Hack>)
-    {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
-    }
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx& idx_up_new,
+                                                     Number<Hack>)
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        constexpr auto I0 = Number<0>{};
+
+        idx_diff_low(I0) = idx_diff_up[I0];
+
+        idx_low += idx_diff_low;
+    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -199,11 +176,6 @@ struct DynamicLeftPad
    const UpperIndex up_lengths_;
    const index_t left_pad_;

-#if 0
-    __host__ __device__ constexpr DynamicLeftPad(const DynamicLeftPad&) = default;
-    __host__ __device__ constexpr DynamicLeftPad(DynamicLeftPad&&) = default;
-#else
    __host__ __device__ constexpr DynamicLeftPad(const DynamicLeftPad& other)
        : up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}
    {

@@ -213,7 +185,6 @@ struct DynamicLeftPad
        : up_lengths_{other.up_lengths_}, left_pad_{other.left_pad_}
    {
    }
-#endif

    __host__ __device__ constexpr DynamicLeftPad(const index_t& low_length,
                                                 const index_t& left_pad)
        : up_lengths_{make_multi_index(low_length + left_pad)}, left_pad_{left_pad}

@@ -238,32 +209,26 @@ struct DynamicLeftPad
        idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_;
    }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                                      const UpIdxDiff& idx_diff_up,
-                                                                      const LowIdx& /* idx_low_old */,
-                                                                      const UpIdx& /* idx_up_old */)
-    {
-        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
-                          UpIdx::Size() == 1,
-                      "wrong! inconsistent # of dimension");
-
-        idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
-    }
-
    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
-    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                 const UpIdxDiff& idx_diff_up,
-                                                                 const LowIdx& idx_low_old,
-                                                                 const UpIdx& idx_up_old,
-                                                                 Number<Hack>)
-    {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
-    }
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx& idx_up_new,
+                                                     Number<Hack>)
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        constexpr auto I0 = Number<0>{};
+
+        idx_diff_low(I0) = idx_diff_up[I0];
+
+        idx_low += idx_diff_low;
+    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -291,11 +256,6 @@ struct DynamicRightPad
    const index_t low_length_;
    const index_t right_pad_;

-#if 0
-    __host__ __device__ constexpr DynamicRightPad(const DynamicRightPad&) = default;
-    __host__ __device__ constexpr DynamicRightPad(DynamicRightPad&&) = default;
-#else
    __host__ __device__ constexpr DynamicRightPad(const DynamicRightPad& other)
        : up_lengths_{other.up_lengths_}, low_length_{other.low_length_},

@@ -309,7 +269,6 @@ struct DynamicRightPad
          right_pad_{other.right_pad_}
    {
    }
-#endif

    __host__ __device__ constexpr DynamicRightPad(const index_t& low_length,
                                                  const index_t& right_pad)

@@ -339,32 +298,26 @@ struct DynamicRightPad
        idx_low(Number<0>{}) = idx_up[Number<0>{}];
    }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                                      const UpIdxDiff& idx_diff_up,
-                                                                      const LowIdx& /* idx_low_old */,
-                                                                      const UpIdx& /* idx_up_old */)
-    {
-        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
-                          UpIdx::Size() == 1,
-                      "wrong! inconsistent # of dimension");
-
-        idx_diff_low(Number<0>{}) = idx_diff_up[Number<0>{}];
-    }
-
    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
-    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                 const UpIdxDiff& idx_diff_up,
-                                                                 const LowIdx& idx_low_old,
-                                                                 const UpIdx& idx_up_old,
-                                                                 Number<Hack>)
-    {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
-    }
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx& idx_up_new,
+                                                     Number<Hack>)
+    {
+        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 &&
+                          UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+
+        constexpr auto I0 = Number<0>{};
+
+        idx_diff_low(I0) = idx_diff_up[I0];
+
+        idx_low += idx_diff_low;
+    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -392,11 +345,6 @@ struct DynamicEmbed
    const UpperIndex up_lengths_;
    const UpperIndex coefficients_;

-#if 0
-    __host__ __device__ constexpr DynamicEmbed(const DynamicEmbed&) = default;
-    __host__ __device__ constexpr DynamicEmbed(DynamicEmbed&&) = default;
-#else
    __host__ __device__ constexpr DynamicEmbed(const DynamicEmbed& other)
        : up_lengths_{other.up_lengths_}, coefficients_{other.coefficients_}
    {

@@ -406,7 +354,7 @@ struct DynamicEmbed
        : up_lengths_{other.up_lengths_}, coefficients_{other.coefficients_}
    {
    }
-#endif

    __host__ __device__ constexpr DynamicEmbed(const UpperIndex& up_lengths,
                                               const UpperIndex& coefficients)
        : up_lengths_{up_lengths}, coefficients_{coefficients}

@@ -447,11 +395,16 @@ struct DynamicEmbed
        });
    }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                               const UpIdxDiff& idx_diff_up,
-                                                               const LowIdx& /* idx_low_old */,
-                                                               const UpIdx& /* idx_up_old */) const
+    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff& idx_diff_up,
+                                              LowIdx& idx_low,
+                                              const UpIdx& idx_up_new,
+                                              Number<Hack>) const
    {
        static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == NDimUp &&
                          LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,

@@ -461,20 +414,8 @@ struct DynamicEmbed
        static_for<0, NDimUp, 1>{}([&](auto i) {
            idx_diff_low(Number<0>{}) += idx_diff_up[i] * coefficients_[i];
        });
-    }
-
-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                    const UpIdxDiff& idx_diff_up,
-                                                                    const LowIdx& idx_low_old,
-                                                                    const UpIdx& idx_up_old,
-                                                                    Number<Hack>) const
-    {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
+
+        idx_low += idx_diff_low;
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
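Editor's note: DynamicEmbed maps several upper indices onto one lower index through per-dimension coefficients, so the lower-index difference computed in the loop above is just the dot product of the upper diff with coefficients_, and the new UpdateLowerIndex then adds it to idx_low. A small hedged sketch of that arithmetic with made-up coefficients:

    #include <cassert>

    int main()
    {
        // Hypothetical 2-d embed: low = 7 * up0 + 1 * up1.
        const int coefficients[2] = {7, 1};
        const int idx_diff_up[2]  = {1, -2}; // move +1 in dim 0, -2 in dim 1

        int idx_diff_low = 0;
        for(int i = 0; i < 2; ++i)
        {
            idx_diff_low += idx_diff_up[i] * coefficients[i];
        }
        assert(idx_diff_low == 5); // 1*7 + (-2)*1

        int idx_low = 10;
        idx_low += idx_diff_low; // what UpdateLowerIndex now does in addition
        assert(idx_low == 15);
        return 0;
    }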
@@ -502,11 +443,6 @@ struct DynamicMerge
    const LowerIndex low_lengths_scan_;
    const UpperIndex up_lengths_;

-#if 0
-    __host__ __device__ constexpr DynamicMerge(const DynamicMerge&) = default;
-    __host__ __device__ constexpr DynamicMerge(DynamicMerge&&) = default;
-#else
    __host__ __device__ constexpr DynamicMerge(const DynamicMerge& other)
        : low_lengths_{other.low_lengths_}, low_lengths_scan_{other.low_lengths_scan_},

@@ -520,7 +456,6 @@ struct DynamicMerge
          up_lengths_{other.up_lengths_}
    {
    }
-#endif

    __host__ __device__ constexpr DynamicMerge(const LowerIndex& low_lengths)
        : low_lengths_{low_lengths},

@@ -562,101 +497,15 @@ struct DynamicMerge
        idx_low(Number<NDimLow - 1>{}) = tmp;
    }

-    // idx_diff_low depends on idx_low_old, so idx_low need to be up-to-date
-    // If idx_diff_up is known at compile-time, many calculations can be optimized
-    // away by compiler
-    // This function assume idx_low_old is not out-of-bound
-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                               const UpIdxDiff& idx_diff_up,
-                                                               const LowIdx& idx_low_old,
-                                                               const UpIdx& /* idx_up_old */) const
-    {
-        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
-                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
-                      "wrong! inconsistent # of dimension");
-
-        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
-        // However,
-        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
-        //      can be calculated at compile-time.
-        //   2) If idx_diff_up is not known at compile-time, but its value
-        //      doesn't change during the whole kernel execution, then idx_diff_low_const also
-        //      doesn't change during the whole kernel execution. Compiler generated ISA should
-        //      only caclculate idx_diff_low_const once and save it durinng the whole
-        //      kernel execution
-        // If neither 1) nor 2) is satisfied, then the calculation will also be computed at
-        // run-time each time this function is called, and can be very expensive.
-        LowerIndex idx_diff_low_const;
-
-#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
-        CalculateLowerIndex(idx_diff_low_const, idx_diff_up);
-#else
-        index_t tmp = idx_diff_up[Number<0>{}];
-
-        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
-            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
-            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
-        });
-
-        // Hack: this force result into SGPR. Need to make sure the result is thread invariant
-        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
-#endif
-
-        // do carry check on each low dimension in reversed order
-        // do not need to check the first dimension
-        index_t carry = 0;
-
-        static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
-            // this should be saved in SGPR as well
-            index_t idx_low_length_minus_idx_diff_low_const = low_lengths_[i] - idx_diff_low_const[i];
-#if 1
-            index_t idx_low_length_plus_idx_diff_low_const = low_lengths_[i] + idx_diff_low_const[i];
-#endif
-
-            index_t idx_low_tmp = idx_low_old[i] + carry;
-
-            bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const;
-#if 1
-            bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
-#endif
-
-            idx_diff_low(i) =
-                do_carry ? -idx_low_length_minus_idx_diff_low_const : idx_diff_low_const[i];
-#if 1
-            idx_diff_low(i) = do_borrow ? idx_low_length_plus_idx_diff_low_const : idx_diff_low[i];
-#endif
-
-            idx_diff_low(i) += carry;
-
-            carry = do_carry ? 1 : 0;
-#if 1
-            carry = do_borrow ? -1 : carry;
-#endif
-        });
-
-        idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
-    }
-
    // idx_diff_low depends on idx_low_old, so idx_low need to be up-to-date
    //
    // If idx_diff_up is known at compile-time, many calculations can be optimized
    // away by compiler
    // This function assume idx_low_old is not out-of-bound
-    // this version save computation but use more register
    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack_1(LowIdxDiff& idx_diff_low,
-                                                                      const UpIdxDiff& idx_diff_up,
-                                                                      const LowIdx& idx_low_old,
-                                                                      const UpIdx& /* idx_up_old */,
-                                                                      Number<Hack>) const
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff& idx_diff_up,
+                                              LowIdx& idx_low,
+                                              const UpIdx& idx_up_new,
+                                              Number<Hack>) const
    {
        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&

@@ -675,11 +524,9 @@ struct DynamicMerge
        //      only caclculate idx_diff_low_const once and save it durinng the whole
        //      kernel execution
        // If neither 1) nor 2) is satisfied, then the calculation will also be computed at
        // run-time each time this function is called, and can be very expensive.
        LowerIndex idx_diff_low_const;
-        LowerIndex idx_low_length_minus_idx_diff_low_const;
-        LowerIndex idx_low_length_plus_idx_diff_low_const;

#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
        index_t tmp = idx_diff_up[Number<0>{}];

@@ -690,12 +537,6 @@ struct DynamicMerge
        });

        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
-
-        static_for<0, NDimLow, 1>{}([&](auto i) {
-            idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
-
-            idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
-        });
#else
        // Hack: this force result into SGPR. Need to make sure the result is thread invariant
        index_t tmp = idx_diff_up[Number<0>{}];

@@ -706,389 +547,70 @@ struct DynamicMerge
        });

        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
-
-        static_for<0, NDimLow, 1>{}([&](auto i) {
-            idx_low_length_minus_idx_diff_low_const(i) =
-                __builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
-
-            idx_low_length_plus_idx_diff_low_const(i) =
-                __builtin_amdgcn_readfirstlane(low_lengths_[i] + idx_diff_low_const[i]);
-        });
#endif

        if constexpr(Hack == 1)
        {
            // do carry check on each low dimension in reversed order
            // do not need to check the first dimension
-            index_t carry = 0;
-
-            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
-                index_t idx_low_tmp = idx_low_old[i] + carry;
-
-                bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
-
-                idx_diff_low(i) =
-                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
-
-                idx_diff_low(i) += carry;
-
-                carry = do_carry ? 1 : 0;
-            });
-
-            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
+            bool do_carry = 0;
+
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
+
+                index_t idx_low_tmp = idx_low[i] + idx_diff_low[i];
+
+                do_carry = idx_low_tmp >= low_lengths_[i];
+
+                // TODO: use exec-mask inline asm
+                if(do_carry)
+                {
+                    idx_diff_low(i) -= low_lengths_[i];
+                }
+
+                idx_low(i) += idx_diff_low[i];
+            });
+
+            constexpr auto I0 = Number<0>{};
+
+            idx_diff_low(I0) = idx_diff_low_const[I0] + do_carry;
+
+            idx_low(I0) += idx_diff_low[I0];
        }
        else if constexpr(Hack == 2)
        {
-            // do carry check on each low dimension in reversed order
+            // do borrow check on each low dimension in reversed order
            // do not need to check the first dimension
-            index_t borrow = 0;
-
-            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
-                index_t idx_low_tmp = idx_low_old[i] - borrow;
-
-                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
-
-                idx_diff_low(i) =
-                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
-
-                idx_diff_low(i) -= borrow;
-
-                borrow = do_borrow ? 1 : 0;
-            });
-
-            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;
+            bool do_borrow = 0;
+
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
+
+                index_t idx_low_tmp = idx_low[i] + idx_diff_low[i];
+
+                do_borrow = idx_low_tmp < 0;
+
+                // TODO: use exec-mask inline asm
+                if(do_borrow)
+                {
+                    idx_diff_low(i) += low_lengths_[i];
+                }
+
+                idx_low(i) += idx_diff_low[i];
+            });
+
+            constexpr auto I0 = Number<0>{};
+
+            idx_diff_low(I0) = idx_diff_low_const[I0] - do_borrow;
+
+            idx_low(I0) += idx_diff_low[I0];
        }
        else
        {
-            // do carry check on each low dimension in reversed order
-            // do not need to check the first dimension
-            index_t carry = 0;
-
-            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
-                index_t idx_low_tmp = idx_low_old[i] + carry;
-
-                bool do_carry  = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
-                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
-
-                idx_diff_low(i) =
-                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
-                idx_diff_low(i) =
-                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
-
-                idx_diff_low(i) += carry;
-
-                carry = do_carry ? 1 : 0;
-                carry = do_borrow ? -1 : carry;
-            });
-
-            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
+            // not implemented
        }
    }

-    // idx_diff_low depends on idx_low_old, so idx_low need to be up-to-date
-    // If idx_diff_up is known at compile-time, many calculations can be optimized
-    // away by compiler
-    // This function assume idx_low_old is not out-of-bound
-    // this version use less register but more computation
-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack_2(LowIdxDiff& idx_diff_low,
-                                                                      const UpIdxDiff& idx_diff_up,
-                                                                      const LowIdx& idx_low_old,
-                                                                      const UpIdx& /* idx_up_old */,
-                                                                      Number<Hack>) const
-    {
-        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
-                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
-                      "wrong! inconsistent # of dimension");
-
-        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
-        // However,
-        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
-        //      can be calculated at compile-time.
-        //   2) If idx_diff_up is not known at compile-time, but its value
-        //      doesn't change during the whole kernel execution, then idx_diff_low_const also
-        //      doesn't change during the whole kernel execution. Compiler generated ISA should
-        //      only caclculate idx_diff_low_const once and save it durinng the whole
-        //      kernel execution
-        // If neither 1) nor 2) is satisfied, then the calculation will also be computed at
-        // run-time each time this function is called, and can be very expensive.
-        LowerIndex idx_diff_low_const;
-        LowerIndex idx_low_length_minus_idx_diff_low_const;
-        LowerIndex idx_low_length_plus_idx_diff_low_const;
-
-#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
-        index_t tmp = idx_diff_up[Number<0>{}];
-
-        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
-            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
-            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
-        });
-
-        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
-#else
-        // Hack: this force result into SGPR. Need to make sure the result is thread invariant
-        index_t tmp = idx_diff_up[Number<0>{}];
-
-        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
-            idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
-            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
-        });
-
-        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
-#endif
-
-        if constexpr(Hack == 1)
-        {
-            // do carry check on each low dimension in reversed order
-            // do not need to check the first dimension
-            bool do_carry = 0;
-
-            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
-                idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
-
-                index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
-
-                do_carry = idx_low_tmp >= low_lengths_[i];
-
-                idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i];
-            });
-
-            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry;
-        }
-        else if constexpr(Hack == 2)
-        {
-            // do carry check on each low dimension in reversed order
-            // do not need to check the first dimension
-            bool do_borrow = 0;
-
-            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
-                idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
-
-                index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow;
-
-                do_borrow = idx_low_tmp < 0;
-
-                idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i];
-            });
-
-            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
-        }
-        else
-        {
-#if 0
-            // do carry check on each low dimension in reversed order
-            // do not need to check the first dimension
-            index_t carry = 0;
-
-            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
-                idx_diff_low(i) = idx_diff_low_const[i] + carry;
-
-                bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
-                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
-
-                idx_diff_low(i) =
-                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
-                idx_diff_low(i) =
-                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
-
-                idx_diff_low(i) += carry;
-
-                carry = do_carry ? 1 : 0;
-                carry = do_borrow ? -1 : carry;
-            });
-
-            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
-#endif
-        }
-    }
-
-    // idx_diff_low depends on idx_low_old, so idx_low need to be up-to-date
-    // If idx_diff_up is known at compile-time, many calculations can be optimized
-    // away by compiler
-    // This function assume idx_low_old is not out-of-bound
-    // this version use less register but more computation
-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack_3(LowIdxDiff& idx_diff_low,
-                                                                      const UpIdxDiff& idx_diff_up,
-                                                                      const LowIdx& idx_low_old,
-                                                                      const UpIdx& /* idx_up_old */,
-                                                                      Number<Hack>) const
-    {
-        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
-                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
-                      "wrong! inconsistent # of dimension");
-
-        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
-        // However,
-        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
-        //      can be calculated at compile-time.
-        //   2) If idx_diff_up is not known at compile-time, but its value
-        //      doesn't change during the whole kernel execution, then idx_diff_low_const also
-        //      doesn't change during the whole kernel execution. Compiler generated ISA should
-        //      only caclculate idx_diff_low_const once and save it durinng the whole
-        //      kernel execution
-        // If neither 1) nor 2) is satisfied, then the calculation will also be computed at
-        // run-time each time this function is called, and can be very expensive.
-        LowerIndex idx_diff_low_const;
-        LowerIndex idx_low_length_minus_idx_diff_low_const;
-        LowerIndex idx_low_length_plus_idx_diff_low_const;
-
-#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
-        index_t tmp = idx_diff_up[Number<0>{}];
-
-        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
-            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
-            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
-        });
-
-        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
-#else
-        // Hack: this force result into SGPR. Need to make sure the result is thread invariant
-        index_t tmp = idx_diff_up[Number<0>{}];
-
-        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
-            idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
-            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
-        });
-
-        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
-#endif
-
-        if constexpr(Hack == 1)
-        {
-#if 1
-            // do carry check on each low dimension in reversed order
-            // do not need to check the first dimension
-            bool do_carry = 0;
-
-            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
-                idx_diff_low(i) = idx_diff_low_const[i] + do_carry;
-
-                index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] + do_carry;
-
-                do_carry = idx_low_tmp >= low_lengths_[i];
-
-                idx_diff_low(i) = do_carry ? idx_diff_low(i) - low_lengths_[i] : idx_diff_low[i];
-            });
-
-            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + do_carry;
-#else
-            LowerIndex idx_low_new = idx_low_old;
-
-            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
-                auto i_m1 = i - Number<1>{};
-
-                int64_t exec_mask;
-                int64_t do_carry;
-
-                idx_low_new(i) = idx_diff_low_const[i] + idx_low_old[i];
-
-                asm volatile("\n \
-                    s_mov_b64 %0, exec \n \
-                    v_cmpx_le_u32_e64 %1, %4, %2 \n \
-                    v_subrev_u32 %2, %4, %2 \n \
-                    v_add_u32 %3, %3, 1 \n \
-                    s_mov_b64 exec, %0 \n \
-                    "
-                             : "=s"(exec_mask), "=s"(do_carry), "=v"(idx_low_new(i)), "=v"(idx_low_new(i_m1))
-                             : "s"(low_lengths_[i]), "2"(idx_low_new[i]), "3"(idx_low_new[i_m1]));
-
-                idx_diff_low(i) = idx_low_new[i] - idx_low_old[i];
-            });
-
-            constexpr auto I0 = Number<0>{};
-
-            idx_low_new(I0) += idx_diff_low_const[I0];
-
-            idx_diff_low(I0) = idx_low_new[I0] - idx_low_old[I0];
-#endif
-        }
-        else if constexpr(Hack == 2)
-        {
-            // do carry check on each low dimension in reversed order
-            // do not need to check the first dimension
-            bool do_borrow = 0;
-
-            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
-                idx_diff_low(i) = idx_diff_low_const[i] - do_borrow;
-
-                index_t idx_low_tmp = idx_low_old[i] + idx_diff_low_const[i] - do_borrow;
-
-                do_borrow = idx_low_tmp < 0;
-
-                idx_diff_low(i) = do_borrow ? idx_diff_low(i) + low_lengths_[i] : idx_diff_low[i];
-            });
-
-            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - do_borrow;
-        }
-        else
-        {
-#if 0
-            // do carry check on each low dimension in reversed order
-            // do not need to check the first dimension
-            index_t carry = 0;
-
-            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
-                idx_diff_low(i) = idx_diff_low_const[i] + carry;
-
-                bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
-                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
-
-                idx_diff_low(i) =
-                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
-                idx_diff_low(i) =
-                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
-
-                idx_diff_low(i) += carry;
-
-                carry = do_carry ? 1 : 0;
-                carry = do_borrow ? -1 : carry;
-            });
-
-            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
-#endif
-        }
-    }
-
-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                    const UpIdxDiff& idx_diff_up,
-                                                                    const LowIdx& idx_low_old,
-                                                                    const UpIdx& idx_up_old,
-                                                                    Number<Hack>) const
-    {
-#if 0
-        // this version save computation but use more register
-        CalculateLowerIndexDiff_hack_1(
-            idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
-#elif 1
-        // this version use less register but more computation
-        CalculateLowerIndexDiff_hack_2(
-            idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
-#elif 1
-        // this version use less register but more computation
-        CalculateLowerIndexDiff_hack_3(
-            idx_diff_low, idx_diff_up, idx_low_old, idx_up_old, Number<Hack>{});
-#endif
-    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
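Editor's note: for DynamicMerge a single merged upper index corresponds to several lower indices, so a fixed step of the upper index becomes a fixed per-dimension offset (idx_diff_low_const, derived once from low_lengths_scan_) plus carry or borrow propagation through the lower dimensions, which is what the Hack == 1 and Hack == 2 branches above do; the __builtin_amdgcn_readfirstlane calls only exist to keep the thread-invariant constants in scalar registers. A hedged, host-side sketch of the carry case with made-up lengths (plain CPU code, no SGPR hack):

    #include <cassert>
    #include <cstdint>
    using index_t = int32_t;

    int main()
    {
        // 3-d lower index behind a merge transform, lengths {4, 3, 5}.
        const index_t low_lengths[3]        = {4, 3, 5};
        index_t idx_low[3]                  = {1, 2, 4};
        // Per-dimension offset of a +1 step of the merged index (idx_diff_low_const).
        const index_t idx_diff_low_const[3] = {0, 0, 1};

        // Carry propagation from the fastest dimension (2) up to dimension 1;
        // dimension 0 only absorbs the final carry, as in the Hack == 1 branch.
        bool do_carry = false;
        for(int i = 2; i > 0; --i)
        {
            index_t diff = idx_diff_low_const[i] + do_carry;
            index_t tmp  = idx_low[i] + diff;
            do_carry     = tmp >= low_lengths[i];
            if(do_carry)
            {
                diff -= low_lengths[i];
            }
            idx_low[i] += diff;
        }
        idx_low[0] += idx_diff_low_const[0] + do_carry;

        // {1,2,4} is merged index 29 (= 1*15 + 2*5 + 4); one step forward is 30 = {2,0,0}.
        assert(idx_low[0] == 2 && idx_low[1] == 0 && idx_low[2] == 0);
        return 0;
    }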
@@ -1156,27 +678,20 @@ struct DynamicUnMerge
        }
    }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                               const UpIdxDiff& idx_diff_up,
-                                                               const LowIdx& /* idx_low_old */,
-                                                               const UpIdx& /* idx_up_old */) const
-    {
-        CalculateLowerIndex(idx_diff_low, idx_diff_up);
-    }
-
    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
-    __host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                    const UpIdxDiff& idx_diff_up,
-                                                                    const LowIdx& idx_low_old,
-                                                                    const UpIdx& idx_up_old,
-                                                                    Number<Hack>) const
-    {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
-    }
+    __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                              const UpIdxDiff& idx_diff_up,
+                                              LowIdx& idx_low,
+                                              const UpIdx& idx_up_new,
+                                              Number<Hack>) const
+    {
+        CalculateLowerIndex(idx_diff_low, idx_diff_up);
+
+        idx_low += idx_diff_low;
+    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -1221,139 +736,18 @@ struct DynamicFreeze
        idx_low(Number<0>{}) = low_idx_;
    }

-    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
-                                                            const UpIdxDiff& idx_diff_up,
-                                                            const LowIdx& /* idx_low_old */,
-                                                            const UpIdx& /* idx_up_old */)
-    {
-        idx_diff_low(Number<0>{}) = index_t{Number<0>{}};
-    }
-
    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx, index_t Hack>
-    __host__ __device__ static void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
-                                                                 const UpIdxDiff& idx_diff_up,
-                                                                 const LowIdx& idx_low_old,
-                                                                 const UpIdx& idx_up_old,
-                                                                 Number<Hack>)
-    {
-        CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
-    }
+    __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low,
+                                                     const UpIdxDiff& idx_diff_up,
+                                                     LowIdx& idx_low,
+                                                     const UpIdx& idx_up_new,
+                                                     Number<Hack>)
+    {
+        idx_diff_low(Number<0>{}) = index_t{Number<0>{}};
+    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
    {
        return true;
    }

    template <typename UpIdx>
    __host__ __device__ static constexpr bool
    IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */)
    {
        return true;
    }
};
#if 0
template <index_t NDimUp, typename StaticPartialUpLengths>
struct HackSemiDynamicUnMerge
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<NDimUp>;
const UpperIndex up_lengths_;
const UpperIndex up_lengths_scan_;
static constexpr index_t NDimUpStatic = StaticPartialUpLengths::Size();
static constexpr index_t NDimUpDynamic = NDimUp - NDimUpStatic;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_;
const MultiIndex<NDimUpDynamic> dynamic_partial_up_lengths_scan_;
static constexpr auto static_partial_up_lengths_ = StaticPartialUpLengths{};
static constexpr auto static_partial_up_lengths_scan_ = reverse_exclusive_scan_sequence(
static_partial_up_lengths_, math::multiplies<index_t>(), Number<1>{});
__host__ __device__ constexpr HackSemiDynamicUnMerge(
const MultiIndex<NDimUpDynamic>& dynamic_partial_up_lengths)
: dynamic_partial_up_lengths_{dynamic_partial_up_lengths},
dynamic_partial_up_lengths_scan_{
container_reverse_exclusive_scan(dynamic_partial_up_lengths,
math::multiplies<index_t>(),
static_partial_up_lengths_scan_[Number<0>{}])}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ constexpr HackSemiDynamicUnMerge()
: up_lengths_{make_zero_multi_index<NDimUpDynamic>()},
up_lengths_scan_{make_zero_multi_index<NDimUpStatic>()}
{
static_assert(NDimUpDynamic + NDimUpStatic == NDimUp,
"wrong! inconsisitent # of dimensions");
}
__host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; }
__host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; }
__host__ __device__ constexpr const auto GetUpperLengths() const
{
UpperIndex up_lengths;
static_for<0, NDimUpDynamic, 1>{}(
[&](auto i) { up_lengths(i) = dynamic_partial_up_lengths_[i]; });
static_for<0, NDimUpStatic, 1>{}([&](auto i) {
up_lengths(i + Number<NDimUpDynamic>{}) = static_partial_up_lengths_[i];
});
return up_lengths;
}
template <typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
const UpIdx& idx_up) const
{
idx_low(Number<0>{}) = idx_up[Number<NDimUp - 1>{}];
static_for<0, NDimUpDynamic, 1>{}([&](auto i) {
idx_low(Number<0>{}) += idx_up[i] * dynamic_partial_up_lengths_scan_[i];
});
static_for<NDimUpDynamic, NDimUp - 1, 1>{}([&](auto i) {
idx_low(Number<0>{}) +=
idx_up[i] * static_partial_up_lengths_scan_[i + Number<NDimUpDynamic>{}];
});
}
template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
__host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& /* idx_low_old */,
const UpIdx& /* idx_up_old */) const
{
CalculateLowerIndex(idx_diff_low, idx_diff_up);
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ constexpr void CalculateLowerIndexDiff_hack(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
const LowIdx& idx_low_old,
const UpIdx& idx_up_old,
Number<Hack>) const
{
CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low_old, idx_up_old);
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

@@ -1370,7 +764,6 @@ struct HackSemiDynamicUnMerge
        return true;
    }
};
-#endif

} // namespace ck
#endif
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp  (view file @ c1ed17f8)

@@ -502,27 +502,18 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
        constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
        constexpr auto dims_up  = TensorDesc::GetUpperDimensionIdss().At(itran);

-        const auto idx_up = get_container_subset(idx_hidden, dims_up);
+        const auto idx_up_new = get_container_subset(idx_hidden, dims_up);

        auto idx_low = get_container_subset(idx_hidden, dims_low);

        const auto idx_diff_up = get_container_subset(idx_diff_hidden, dims_up);

        MultiIndex<dims_low.Size()> idx_diff_low;

        // calculate idx_diff_low
#if 0 // hack
        tran.CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low, idx_up);
#else
-        // HACK: control CalculateLowerIndexDiff for DynamicMerge using ing hack
+        // HACK: control UpdateLowerIndex for DynamicMerge using hack
        // TODO remove hack
        constexpr index_t Hack = decltype(coord_step.hack_calculate_lower_index_diff_)::At(itran);

-        tran.CalculateLowerIndexDiff_hack(idx_diff_low, idx_diff_up, idx_low, idx_up, Number<Hack>{});
+        tran.UpdateLowerIndex(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#endif

-        // update idx_low
-        idx_low += idx_diff_low;
-
        set_container_subset(idx_diff_hidden, dims_low, idx_diff_low);
        set_container_subset(idx_hidden, dims_low, idx_low);
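Editor's note: the Hack value the caller extracts from coord_step is a compile-time tag, carried in the Number<Hack> argument, that appears to tell each transform which boundary check it can skip for this particular step: 1 selects the carry-only path (the step never decreases the index) and 2 the borrow-only path (it never increases it). A hedged, stand-alone illustration of this kind of tag dispatch, independent of the real transform classes:

    #include <cassert>
    #include <cstdint>
    using index_t = int32_t;

    // Simplified stand-in for ck::Number<I>: an integer carried in the type.
    template <index_t I>
    struct Number
    {
        static constexpr index_t value = I;
    };

    // Only the branch selected by the tag is instantiated, so the unused
    // boundary check costs nothing at run time.
    template <index_t Hack>
    void update_1d(index_t& idx, index_t diff, index_t length, Number<Hack>)
    {
        idx += diff;

        if constexpr(Hack == 1) // step known to be non-negative: carry check only
        {
            if(idx >= length) { idx -= length; }
        }
        else if constexpr(Hack == 2) // step known to be non-positive: borrow check only
        {
            if(idx < 0) { idx += length; }
        }
    }

    int main()
    {
        index_t i = 7;
        update_1d(i, +3, 8, Number<1>{}); // wraps 10 -> 2
        assert(i == 2);
        update_1d(i, -3, 8, Number<2>{}); // wraps -1 -> 7
        assert(i == 7);
        return 0;
    }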
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp  (view file @ c1ed17f8)

@@ -384,13 +384,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
            Float,
            decltype(c_m0_m1_n0_n1_thread_desc),
            decltype(c_m0_m1_n0_n1_global_desc),
-#if 1 // debug
            Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
            CThreadTransferSrcDstAccessOrder,
-#else
-            Sequence<1, 1, 2, 4>,
-            Sequence<0, 1, 2, 3>,
-#endif
            CThreadTransferSrcDstVectorDim,
            1,
            CThreadTransferDstScalarPerVector,
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp  (view file @ c1ed17f8)

@@ -922,7 +922,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
            src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});

        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
-#elif 0
+#elif 1
        // for non-padded input tensor
        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
            src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});

@@ -1067,7 +1067,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        // for padded input tensor
        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
-#elif 0
+#elif 1
        // for non-paded input tensor
        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
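Editor's note: the Sequence<...> argument of make_dynamic_tensor_coordinate_step_hack looks like a per-hidden-dimension list of those Hack flags: the +1 step above passes ..., 1, 0 (carry-only for the dimension that moves) while the mirrored -1 step passes ..., 2, 0 (borrow-only). Assuming that reading is right, picking the flag is just a function of the step's sign; a one-line hedged sketch:

    // Hedged sketch: choose the hack flag from the step direction for one dimension.
    constexpr int hack_for_step(int step)
    {
        return step > 0 ? 1  // forward step: carry check only
             : step < 0 ? 2  // backward step: borrow check only
                        : 0; // no movement in this dimension: no check needed
    }

    static_assert(hack_for_step(+1) == 1, "");
    static_assert(hack_for_step(-1) == 2, "");
    static_assert(hack_for_step(0) == 0, "");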
composable_kernel/include/utility/amd_buffer_addressing.hpp  (view file @ c1ed17f8)

@@ -152,7 +152,7 @@ __device__ float amd_buffer_load<float, 1>(const float* p_src_wave,
    return __llvm_amdgcn_buffer_load_f32(
        src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
#else
-#if 1 // debug
+#if 0 // debug
    float tmp = __llvm_amdgcn_buffer_load_f32(
        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (view file @ c1ed17f8)

@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    constexpr auto conv_driver =
-#if 1 // debug
+#if 1
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#else
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad