Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
2fc3888d
Commit
2fc3888d
authored
Mar 31, 2021
by
Chao Liu
Browse files
adding magic number division
parent
fcbb9788
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
350 additions
and
7 deletions
+350
-7
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
...lude/tensor_description/dynamic_multi_index_transform.hpp
+229
-1
composable_kernel/include/utility/common_header.hpp
composable_kernel/include/utility/common_header.hpp
+1
-0
composable_kernel/include/utility/magic_division.hpp
composable_kernel/include/utility/magic_division.hpp
+114
-0
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+2
-2
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+4
-4
No files found.
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
View file @
2fc3888d
...
...
@@ -467,6 +467,27 @@ struct DynamicEmbed
}
};
#if 1
template
<
typename
LowLengths
>
struct
lambda_merge_generate_magic_division_calculate_magic_multiplier
{
template
<
index_t
I
>
__host__
__device__
constexpr
auto
operator
()(
Number
<
I
>
i
)
const
{
return
magic_division
::
CalculateMagicMultiplier
(
LowLengths
{}[
i
]);
}
};
template
<
typename
LowLengths
>
struct
lambda_merge_generate_magic_division_calculate_magic_shift
{
template
<
index_t
I
>
__host__
__device__
constexpr
auto
operator
()(
Number
<
I
>
i
)
const
{
return
magic_division
::
CalculateMagicShift
(
LowLengths
{}[
i
]);
}
};
template
<
typename
LowLengths
>
struct
DynamicMerge
{
...
...
@@ -478,11 +499,21 @@ struct DynamicMerge
using
LowLengthsScan
=
decltype
(
container_reverse_exclusive_scan
(
LowLengths
{},
math
::
multiplies_v2
{},
Number
<
1
>
{}));
using
LowLengthsMagicDivisorMultipiler
=
decltype
(
generate_tuple
(
lambda_merge_generate_magic_division_calculate_magic_multiplier
<
LowLengths
>
{},
Number
<
NDimLow
>
{}));
using
LowLengthsMagicDivisorShift
=
decltype
(
generate_tuple
(
lambda_merge_generate_magic_division_calculate_magic_shift
<
LowLengths
>
{},
Number
<
NDimLow
>
{}));
using
UpLengths
=
decltype
(
make_tuple
(
container_reduce
(
LowLengths
{},
math
::
multiplies_v2
{},
Number
<
1
>
{})));
LowLengths
low_lengths_
;
LowLengthsScan
low_lengths_scan_
;
LowLengthsMagicDivisorMultipiler
low_lengths_magic_divisor_multiplier_
;
LowLengthsMagicDivisorShift
low_lengths_magic_divisor_shift_
;
UpLengths
up_lengths_
;
__host__
__device__
constexpr
DynamicMerge
()
=
default
;
...
...
@@ -491,6 +522,12 @@ struct DynamicMerge
:
low_lengths_
{
low_lengths
},
low_lengths_scan_
{
container_reverse_exclusive_scan
(
low_lengths
,
math
::
multiplies_v2
{},
Number
<
1
>
{})},
low_lengths_magic_divisor_multiplier_
{
generate_tuple
(
[
&
](
auto
i
)
{
return
magic_division
::
CalculateMagicMultiplier
(
low_lengths
[
i
]);
},
Number
<
NDimLow
>
{})},
low_lengths_magic_divisor_shift_
{
generate_tuple
(
[
&
](
auto
i
)
{
return
magic_division
::
CalculateMagicShift
(
low_lengths
[
i
]);
},
Number
<
NDimLow
>
{})},
up_lengths_
{
make_tuple
(
container_reduce
(
low_lengths
,
math
::
multiplies_v2
{},
Number
<
1
>
{}))}
{
static_assert
(
LowerIndex
::
Size
()
==
NDimLow
,
"wrong!"
);
...
...
@@ -511,12 +548,27 @@ struct DynamicMerge
index_t
tmp
=
idx_up
[
Number
<
0
>
{}];
static_for
<
0
,
NDimLow
-
1
,
1
>
{}([
&
idx_low
,
&
tmp
,
this
](
auto
i
)
{
#if 0
// normal division
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_low(i) = tmp / this->low_lengths_scan_[i];
tmp -= idx_low[i] * this->low_lengths_scan_[i];
});
idx_low(Number<NDimLow - 1>{}) = tmp;
#else
// magic division
static_for
<
NDimLow
-
1
,
0
,
-
1
>
{}([
&
](
auto
i
)
{
index_t
tmp2
=
magic_division
::
DoMagicDivision
(
tmp
,
this
->
low_lengths_magic_divisor_multiplier_
[
i
],
this
->
low_lengths_magic_divisor_shift_
[
i
]);
idx_low
(
i
)
=
tmp
-
tmp2
*
this
->
low_lengths_
[
i
];
tmp
=
tmp2
;
});
idx_low
(
Number
<
0
>
{})
=
tmp
;
#endif
}
template
<
typename
LowIdxDiff
,
...
...
@@ -555,12 +607,27 @@ struct DynamicMerge
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t
tmp
=
idx_diff_up
[
Number
<
0
>
{}];
#if 1
// normal division
static_for
<
0
,
NDimLow
-
1
,
1
>
{}([
&
](
auto
i
)
{
idx_diff_low_const
(
i
)
=
tmp
/
low_lengths_scan_
[
i
];
tmp
-=
idx_diff_low_const
[
i
]
*
low_lengths_scan_
[
i
];
});
idx_diff_low_const
(
Number
<
NDimLow
-
1
>
{})
=
tmp
;
#else
// magic division
static_for
<
NDimLow
-
1
,
0
,
-
1
>
{}([
&
](
auto
i
)
{
index_t
tmp2
=
magic_division
::
DoMagicDivision
(
tmp
,
this
->
low_lengths_magic_divisor_multiplier_
[
i
],
this
->
low_lengths_magic_divisor_shift_
[
i
]);
idx_diff_low_const
(
i
)
=
tmp
-
tmp2
*
this
->
low_lengths_
[
i
];
tmp
=
tmp2
;
});
idx_diff_low_const
(
Number
<
0
>
{})
=
tmp
;
#endif
static_for
<
0
,
NDimLow
,
1
>
{}([
&
](
auto
i
)
{
idx_low_length_minus_idx_diff_low_const
(
i
)
=
low_lengths_
[
i
]
-
idx_diff_low_const
[
i
];
...
...
@@ -571,10 +638,25 @@ struct DynamicMerge
// Hack: this force result into SGPR. Need to make sure the result is thread invariant
index_t
tmp
=
idx_diff_up
[
Number
<
0
>
{}];
#if 1
// normal division
static_for
<
0
,
NDimLow
-
1
,
1
>
{}([
&
](
auto
i
)
{
idx_diff_low_const
(
i
)
=
__builtin_amdgcn_readfirstlane
(
tmp
/
low_lengths_scan_
[
i
]);
tmp
-=
idx_diff_low_const
[
i
]
*
low_lengths_scan_
[
i
];
});
#else
// magic division
static_for
<
NDimLow
-
1
,
0
,
-
1
>
{}([
&
](
auto
i
)
{
index_t
tmp2
=
magic_division
::
DoMagicDivision
(
tmp
,
this
->
low_lengths_magic_divisor_multiplier_
[
i
],
this
->
low_lengths_magic_divisor_shift_
[
i
]);
idx_diff_low_const
(
i
)
=
__builtin_amdgcn_readfirstlane
(
tmp
-
tmp2
*
this
->
low_lengths_
[
i
]);
tmp
=
tmp2
;
});
#endif
idx_diff_low_const
(
Number
<
NDimLow
-
1
>
{})
=
__builtin_amdgcn_readfirstlane
(
tmp
);
...
...
@@ -988,6 +1070,152 @@ struct DynamicMerge
printf
(
"}"
);
}
};
#else
template
<
typename
LowLengths
>
struct
lambda_generate_magic_division_calculate_magic_multiplier
{
template
<
index_t
I
>
__host__
__device__
constexpr
auto
operator
()(
Number
<
I
>
i
)
const
{
return
magic_division
::
CalculateMagicMultiplier
(
LowLengths
{}[
i
]);
}
};
template
<
typename
LowLengths
>
struct
lambda_generate_magic_division_calculate_magic_shift
{
template
<
index_t
I
>
__host__
__device__
constexpr
auto
operator
()(
Number
<
I
>
i
)
const
{
return
magic_division
::
CalculateMagicShift
(
LowLengths
{}[
i
]);
}
};
template
<
typename
LowLengths
>
struct
DynamicMerge
{
static
constexpr
index_t
NDimLow
=
LowLengths
::
Size
();
using
LowerIndex
=
MultiIndex
<
NDimLow
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
using
UpLengths
=
decltype
(
make_tuple
(
container_reduce
(
LowLengths
{},
math
::
multiplies_v2
{},
Number
<
1
>
{})));
using
LowLengthsMagicDivisorMultipiler
=
decltype
(
generate_tuple
(
lambda_generate_magic_division_calculate_magic_multiplier
<
LowLengths
>
{},
Number
<
NDimLow
>
{}));
using
LowLengthsMagicDivisorShift
=
decltype
(
generate_tuple
(
lambda_generate_magic_division_calculate_magic_shift
<
LowLengths
>
{},
Number
<
NDimLow
>
{}));
LowLengths
low_lengths_
;
LowLengthsMagicDivisorMultipiler
low_lengths_magic_divisor_multiplier_
;
LowLengthsMagicDivisorShift
low_lengths_magic_divisor_shift_
;
UpLengths
up_lengths_
;
__host__
__device__
constexpr
DynamicMerge
()
=
default
;
__host__
__device__
constexpr
DynamicMerge
(
const
LowLengths
&
low_lengths
)
:
low_lengths_
{
low_lengths
},
low_lengths_magic_divisor_multiplier_
{
generate_tuple
(
[
&
](
auto
i
)
{
return
magic_division
::
CalculateMagicMultiplier
(
low_lengths
[
i
]);
},
Number
<
NDimLow
>
{})},
low_lengths_magic_divisor_shift_
{
generate_tuple
(
[
&
](
auto
i
)
{
return
magic_division
::
CalculateMagicShift
(
low_lengths
[
i
]);
},
Number
<
NDimLow
>
{})},
up_lengths_
{
make_tuple
(
container_reduce
(
low_lengths
,
math
::
multiplies_v2
{},
Number
<
1
>
{}))}
{
static_assert
(
LowerIndex
::
Size
()
==
NDimLow
,
"wrong!"
);
}
__host__
__device__
static
constexpr
index_t
GetNumOfLowerDimension
()
{
return
NDimLow
;
}
__host__
__device__
static
constexpr
index_t
GetNumOfUpperDimension
()
{
return
1
;
}
__host__
__device__
constexpr
const
auto
&
GetUpperLengths
()
const
{
return
up_lengths_
;
}
template
<
typename
LowIdx
,
typename
UpIdx
>
__host__
__device__
constexpr
void
CalculateLowerIndex
(
LowIdx
&
idx_low
,
const
UpIdx
&
idx_up
)
const
{
static_assert
(
LowIdx
::
Size
()
==
NDimLow
&&
UpIdx
::
Size
()
==
1
,
"wrong! inconsistent # of dimension"
);
index_t
tmp
=
idx_up
[
Number
<
0
>
{}];
static_for
<
NDimLow
-
1
,
0
,
-
1
>
{}([
&
idx_low
,
&
tmp
,
this
](
auto
i
)
{
index_t
tmp2
=
magic_division
::
DoMagicDivision
(
tmp
,
this
->
low_lengths_magic_divisor_multiplier_
[
i
],
this
->
low_lengths_magic_divisor_shift_
[
i
]);
idx_low
(
i
)
=
tmp
-
tmp2
*
this
->
low_lengths_
[
i
];
tmp
=
tmp2
;
});
idx_low
(
Number
<
0
>
{})
=
tmp
;
}
template
<
typename
LowIdxDiff
,
typename
UpIdxDiff
,
typename
LowIdx
,
typename
UpIdx
,
index_t
Hack
>
__host__
__device__
void
UpdateLowerIndex
(
LowIdxDiff
&
idx_diff_low
,
const
UpIdxDiff
&
idx_diff_up
,
LowIdx
&
idx_low
,
const
UpIdx
&
idx_up_new
,
Number
<
Hack
>
)
const
{
static_assert
(
LowIdxDiff
::
Size
()
==
NDimLow
&&
UpIdxDiff
::
Size
()
==
1
&&
LowIdx
::
Size
()
==
NDimLow
&&
UpIdx
::
Size
()
==
1
,
"wrong! inconsistent # of dimension"
);
auto
idx_low_old
=
idx_low
;
CalculateLowerIndex
(
idx_low
,
idx_up_new
);
idx_diff_low
=
idx_low
-
idx_low_old
;
}
__host__
__device__
static
constexpr
bool
IsLinearTransform
()
{
return
false
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexAlwaysMappedToValidLowerIndex
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsKnownAtCompileTime
()
{
return
is_known_at_compile_time
<
LowLengths
>::
value
&&
is_known_at_compile_time
<
LowLengthsMagicDivisorMultipiler
>::
value
&&
is_known_at_compile_time
<
LowLengthsMagicDivisorShift
>::
value
&&
is_known_at_compile_time
<
UpLengths
>::
value
;
}
template
<
typename
UpIdx
>
__host__
__device__
static
constexpr
bool
IsValidUpperIndexMappedToValidLowerIndex
(
const
UpIdx
&
/* idx_up */
)
{
return
true
;
}
__host__
__device__
void
Print
()
const
{
printf
(
"{"
);
printf
(
"DynamicMerge, "
);
printf
(
"low_lengths_ "
);
print_multi_index
(
low_lengths_
);
printf
(
"low_lengths_magic_divisor_multiplier_ "
);
print_multi_index
(
low_lengths_magic_divisor_multiplier_
);
printf
(
"low_lengths_magic_divisor_shift_ "
);
print_multi_index
(
low_lengths_magic_divisor_shift_
);
printf
(
"up_lengths_ "
);
print_multi_index
(
up_lengths_
);
printf
(
"}"
);
}
};
#endif
template
<
typename
UpLengths
,
bool
Use24BitIntegerCalculation
>
struct
DynamicUnMerge
...
...
composable_kernel/include/utility/common_header.hpp
View file @
2fc3888d
...
...
@@ -22,6 +22,7 @@
#include "tuple_helper.hpp"
#include "type.hpp"
#include "utility.hpp"
#include "magic_division.hpp"
#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
...
...
composable_kernel/include/utility/magic_division.hpp
0 → 100644
View file @
2fc3888d
#ifndef CK_MAGIC_DIVISION_HPP
#define CK_MAGIC_DIVISION_HPP
#include "config.hpp"
#include "integral_constant.hpp"
#include "number.hpp"
#include "type.hpp"
#include "tuple.hpp"
namespace
ck
{
// magic number division
struct
magic_division
{
// uint32_t
__host__
__device__
static
constexpr
auto
CalculateMagicNumbers
(
uint32_t
divisor
)
{
// assert(divisior >= 1 && divisior <= INT32_MAX);
uint32_t
shift
=
0
;
for
(
shift
=
0
;
shift
<
32
;
++
shift
)
{
if
((
1U
<<
shift
)
>=
divisor
)
{
break
;
}
}
uint64_t
one
=
1
;
uint64_t
multiplier
=
((
one
<<
32
)
*
((
one
<<
shift
)
-
divisor
))
/
divisor
+
1
;
// assert(multiplier <= 0xffffffffUL);
return
make_tuple
(
uint32_t
(
multiplier
),
shift
);
}
__host__
__device__
static
constexpr
uint32_t
CalculateMagicMultiplier
(
uint32_t
divisor
)
{
auto
tmp
=
CalculateMagicNumbers
(
divisor
);
return
tmp
[
Number
<
0
>
{}];
}
__host__
__device__
static
constexpr
uint32_t
CalculateMagicShift
(
uint32_t
divisor
)
{
auto
tmp
=
CalculateMagicNumbers
(
divisor
);
return
tmp
[
Number
<
1
>
{}];
}
// integral_constant<uint32_t, .>
template
<
uint32_t
Divisor
>
__host__
__device__
static
constexpr
auto
CalculateMagicNumbers
(
integral_constant
<
uint32_t
,
Divisor
>
)
{
constexpr
auto
tmp
=
CalculateMagicNumbers
(
uint32_t
{
Divisor
});
constexpr
uint32_t
multiplier
=
tmp
[
Number
<
0
>
{}];
constexpr
uint32_t
shift
=
tmp
[
Number
<
1
>
{}];
return
make_tuple
(
integral_constant
<
uint32_t
,
multiplier
>
{},
integral_constant
<
uint32_t
,
shift
>
{});
}
template
<
uint32_t
Divisor
>
__host__
__device__
static
constexpr
auto
CalculateMagicMultiplier
(
integral_constant
<
uint32_t
,
Divisor
>
)
{
constexpr
uint32_t
multiplier
=
CalculateMagicMultiplier
(
uint32_t
{
Divisor
});
return
integral_constant
<
uint32_t
,
multiplier
>
{};
}
template
<
uint32_t
Divisor
>
__host__
__device__
static
constexpr
auto
CalculateMagicShift
(
integral_constant
<
uint32_t
,
Divisor
>
)
{
constexpr
uint32_t
shift
=
CalculateMagicShift
(
uint32_t
{
Divisor
});
return
integral_constant
<
uint32_t
,
shift
>
{};
}
// integral_constant<int32_t, .>
template
<
int32_t
Divisor
>
__host__
__device__
static
constexpr
auto
CalculateMagicNumbers
(
integral_constant
<
int32_t
,
Divisor
>
)
{
return
CalculateMagicNumbers
(
integral_constant
<
uint32_t
,
Divisor
>
{});
}
template
<
int32_t
Divisor
>
__host__
__device__
static
constexpr
auto
CalculateMagicMultiplier
(
integral_constant
<
int32_t
,
Divisor
>
)
{
return
CalculateMagicMultiplier
(
integral_constant
<
uint32_t
,
Divisor
>
{});
}
template
<
int32_t
Divisor
>
__host__
__device__
static
constexpr
auto
CalculateMagicShift
(
integral_constant
<
int32_t
,
Divisor
>
)
{
return
CalculateMagicShift
(
integral_constant
<
uint32_t
,
Divisor
>
{});
}
// magic division
__host__
__device__
static
constexpr
uint32_t
DoMagicDivision
(
uint32_t
dividend
,
uint32_t
multiplier
,
uint32_t
shift
)
{
uint32_t
tmp
=
(
uint64_t
(
dividend
)
*
uint64_t
(
multiplier
))
>>
32
;
return
(
tmp
+
dividend
)
>>
shift
;
}
};
}
// namespace ck
#endif
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @
2fc3888d
...
...
@@ -40,7 +40,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
wei_k_c_y_x_device_buf
.
ToDevice
(
wei_k_c_y_x
.
mData
.
data
());
out_n_k_ho_wo_device_buf
.
ToDevice
(
out_n_k_ho_wo
.
mData
.
data
());
#if
0
#if
1
// run-time variables
const
auto
in_n_c_hi_wi_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
to_multi_index
(
InDesc
::
GetLengths
()));
...
...
@@ -167,7 +167,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmCThreadTransferDstScalarPerVector_GemmN1
=
4
;
#elif
1
#elif
0
// cdata = 64, BlockSize 64, 16x256x4
constexpr
index_t
BlockSize
=
64
;
...
...
driver/src/conv_driver.cpp
View file @
2fc3888d
...
...
@@ -64,7 +64,7 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif
1
#elif
0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
HI
=
1080
;
...
...
@@ -150,7 +150,7 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif
0
#elif
1
// 3x3, 71x71
constexpr
index_t
N
=
128
;
constexpr
index_t
C
=
192
;
...
...
@@ -630,7 +630,7 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
to_multi_index
(
ConvStrides
{}));
print_array
(
"ConvDilations"
,
to_multi_index
(
ConvDilations
{}));
#if
0
#if
1
using
in_data_t
=
float
;
constexpr
index_t
in_vector_size
=
1
;
using
acc_data_t
=
float
;
...
...
@@ -724,7 +724,7 @@ int main(int argc, char* argv[])
LeftPads
{},
RightPads
{},
nrepeat
);
#elif
0
#elif
1
device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw
<
in_data_t
,
in_vector_size
,
acc_data_t
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment