Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
3a7fd7d6
Commit
3a7fd7d6
authored
May 21, 2021
by
Chao Liu
Browse files
experimenting magic number division
parent
1928a3d7
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
191 additions
and
3 deletions
+191
-3
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
...lude/tensor_description/dynamic_multi_index_transform.hpp
+159
-0
composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp
...nsor_description/dynamic_multi_index_transform_helper.hpp
+11
-0
composable_kernel/include/utility/magic_division.hpp
composable_kernel/include/utility/magic_division.hpp
+19
-1
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+1
-1
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
...convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+1
-1
No files found.
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
View file @
3a7fd7d6
...
@@ -1164,6 +1164,165 @@ struct DynamicMerge_v2_magic_division
...
@@ -1164,6 +1164,165 @@ struct DynamicMerge_v2_magic_division
}
}
};
};
// Implementation of "Merge" transformation primitive that uses magic-number-division to do lowering
// of both multi-index and delta of multi-index
// Caution:
// 1. The magic number division implementation being used would produce correct result if the
// dividended is uint32_t and its value is with in 31-bit value range of uint32_t.
// 2. The magic number division for int32_t dividened has not been implemented, the int32_t
// dividend would be bit-wise interpreted as uint32_t and magic number division implementation for
// uint32_t is then used.
// 3. For Merge primitive, upper-index is the dividend.
// 4. When upper-index is uint32_t, its value need to be within 31-bit range.
// 5. When upper-index is int32_t type (when index_t is int32_t), its value need to be
// non-negative.
template
<
typename
LowLengths
>
struct
DynamicMerge_v2r2_magic_division
{
static
constexpr
index_t
NDimLow
=
LowLengths
::
Size
();
using
LowerIndex
=
MultiIndex
<
NDimLow
>
;
using
UpperIndex
=
MultiIndex
<
1
>
;
using
LowLengthsScan
=
decltype
(
container_reverse_exclusive_scan
(
LowLengths
{},
math
::
multiplies_v2
{},
Number
<
1
>
{}));
using
UpLengths
=
decltype
(
make_tuple
(
container_reduce
(
LowLengths
{},
math
::
multiplies_v2
{},
Number
<
1
>
{})));
using
LowLengthsScanMagicDivisorMultipiler
=
decltype
(
generate_tuple
(
lambda_merge_generate_MagicDivision_calculate_magic_multiplier
<
LowLengthsScan
>
{},
Number
<
NDimLow
>
{}));
using
LowLengthsScanMagicDivisorShift
=
decltype
(
generate_tuple
(
lambda_merge_generate_MagicDivision_calculate_magic_shift
<
LowLengthsScan
>
{},
Number
<
NDimLow
>
{}));
LowLengths
low_lengths_
;
LowLengthsScan
low_lengths_scan_
;
LowLengthsScanMagicDivisorMultipiler
low_lengths_scan_magic_divisor_multiplier_
;
LowLengthsScanMagicDivisorShift
low_lengths_scan_magic_divisor_shift_
;
UpLengths
up_lengths_
;
__host__
__device__
constexpr
DynamicMerge_v2r2_magic_division
()
=
default
;
__host__
__device__
constexpr
DynamicMerge_v2r2_magic_division
(
const
LowLengths
&
low_lengths
)
:
low_lengths_
{
low_lengths
},
low_lengths_scan_
{
container_reverse_exclusive_scan
(
low_lengths
,
math
::
multiplies_v2
{},
Number
<
1
>
{})},
low_lengths_scan_magic_divisor_multiplier_
{
generate_tuple
(
[
&
](
auto
i
)
{
return
MagicDivision
::
CalculateMagicMultiplier
(
low_lengths_scan_
[
i
]);
},
Number
<
NDimLow
>
{})},
low_lengths_scan_magic_divisor_shift_
{
generate_tuple
(
[
&
](
auto
i
)
{
return
MagicDivision
::
CalculateMagicShift
(
low_lengths_scan_
[
i
]);
},
Number
<
NDimLow
>
{})},
up_lengths_
{
make_tuple
(
container_reduce
(
low_lengths
,
math
::
multiplies_v2
{},
Number
<
1
>
{}))}
{
static_assert
(
LowerIndex
::
Size
()
==
NDimLow
,
"wrong!"
);
}
__host__
__device__
static
constexpr
index_t
GetNumOfLowerDimension
()
{
return
NDimLow
;
}
__host__
__device__
static
constexpr
index_t
GetNumOfUpperDimension
()
{
return
1
;
}
__host__
__device__
constexpr
const
auto
&
GetUpperLengths
()
const
{
return
up_lengths_
;
}
template
<
typename
LowIdx
,
typename
UpIdx
>
__host__
__device__
constexpr
void
CalculateLowerIndex
(
LowIdx
&
idx_low
,
const
UpIdx
&
idx_up
)
const
{
static_assert
(
LowIdx
::
Size
()
==
NDimLow
&&
UpIdx
::
Size
()
==
1
,
"wrong! inconsistent # of dimension"
);
index_t
tmp
=
idx_up
[
Number
<
0
>
{}];
static_for
<
0
,
NDimLow
-
1
,
1
>
{}([
&
,
this
](
auto
i
)
{
idx_low
(
i
)
=
MagicDivision
::
DoMagicDivision
(
tmp
,
this
->
low_lengths_scan_magic_divisor_multiplier_
[
i
],
this
->
low_lengths_scan_magic_divisor_shift_
[
i
]);
tmp
-=
idx_low
[
i
]
*
this
->
low_lengths_scan_
[
i
];
});
idx_low
(
Number
<
NDimLow
-
1
>
{})
=
tmp
;
}
template
<
typename
LowIdxDiff
,
typename
UpIdxDiff
,
typename
LowIdx
,
typename
UpIdx
,
index_t
Hack
>
__host__
__device__
void
UpdateLowerIndex
(
LowIdxDiff
&
idx_diff_low
,
const
UpIdxDiff
&
idx_diff_up
,
LowIdx
&
idx_low
,
const
UpIdx
&
idx_up_new
,
Number
<
Hack
>
)
const
{
static_assert
(
LowIdxDiff
::
Size
()
==
NDimLow
&&
UpIdxDiff
::
Size
()
==
1
&&
LowIdx
::
Size
()
==
NDimLow
&&
UpIdx
::
Size
()
==
1
,
"wrong! inconsistent # of dimension"
);
index_t
tmp
=
idx_up_new
[
Number
<
0
>
{}];
static_for
<
0
,
NDimLow
-
1
,
1
>
{}([
&
,
this
](
auto
i
)
{
index_t
idx_low_old
=
idx_low
[
i
];
idx_low
(
i
)
=
MagicDivision
::
DoMagicDivision
(
tmp
,
this
->
low_lengths_scan_magic_divisor_multiplier_
[
i
],
this
->
low_lengths_scan_magic_divisor_shift_
[
i
]);
idx_diff_low
(
i
)
=
idx_low
[
i
]
-
idx_low_old
;
tmp
-=
idx_low
[
i
]
*
this
->
low_lengths_scan_
[
i
];
});
idx_diff_low
(
Number
<
NDimLow
-
1
>
{})
=
tmp
-
idx_low
[
Number
<
NDimLow
-
1
>
{}];
idx_low
(
Number
<
NDimLow
-
1
>
{})
=
tmp
;
}
__host__
__device__
static
constexpr
bool
IsLinearTransform
()
{
return
false
;
}
__host__
__device__
static
constexpr
bool
IsValidUpperIndexAlwaysMappedToValidLowerIndex
()
{
return
true
;
}
__host__
__device__
static
constexpr
bool
IsKnownAtCompileTime
()
{
return
is_known_at_compile_time
<
LowLengths
>::
value
&&
is_known_at_compile_time
<
LowLengthsScanMagicDivisorMultipiler
>::
value
&&
is_known_at_compile_time
<
LowLengthsScanMagicDivisorShift
>::
value
&&
is_known_at_compile_time
<
UpLengths
>::
value
;
}
template
<
typename
UpIdx
>
__host__
__device__
static
constexpr
bool
IsValidUpperIndexMappedToValidLowerIndex
(
const
UpIdx
&
/* idx_up */
)
{
return
true
;
}
__host__
__device__
void
Print
()
const
{
printf
(
"{"
);
printf
(
"DynamicMerge_v2r2_magic_division, "
);
printf
(
"low_lengths_ "
);
print_multi_index
(
low_lengths_
);
printf
(
"low_lengths_scan "
);
print_multi_index
(
low_lengths_scan_
);
printf
(
"low_lengths_scan_magic_divisor_multiplier_ "
);
print_multi_index
(
low_lengths_scan_magic_divisor_multiplier_
);
printf
(
"low_lengths_scan_magic_divisor_shift_ "
);
print_multi_index
(
low_lengths_scan_magic_divisor_shift_
);
printf
(
"up_lengths_ "
);
print_multi_index
(
up_lengths_
);
printf
(
"}"
);
}
};
template
<
typename
UpLengths
,
bool
Use24BitIntegerCalculation
>
template
<
typename
UpLengths
,
bool
Use24BitIntegerCalculation
>
struct
DynamicUnMerge
struct
DynamicUnMerge
{
{
...
...
composable_kernel/include/tensor_description/dynamic_multi_index_transform_helper.hpp
View file @
3a7fd7d6
...
@@ -56,10 +56,21 @@ __host__ __device__ constexpr auto make_merge_transform(const LowLengths& low_le
...
@@ -56,10 +56,21 @@ __host__ __device__ constexpr auto make_merge_transform(const LowLengths& low_le
#if !CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION
#if !CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION
return
DynamicMerge_v1_carry_check
<
LowLengths
>
{
low_lengths
};
return
DynamicMerge_v1_carry_check
<
LowLengths
>
{
low_lengths
};
#else
#else
#if 1
return
DynamicMerge_v2_magic_division
<
LowLengths
>
{
low_lengths
};
return
DynamicMerge_v2_magic_division
<
LowLengths
>
{
low_lengths
};
#else
return
DynamicMerge_v2r2_magic_division
<
LowLengths
>
{
low_lengths
};
#endif
#endif
#endif
}
}
template
<
typename
LowLengths
>
__host__
__device__
constexpr
auto
make_merge_transform_v2_magic_division
(
const
LowLengths
&
low_lengths
)
{
return
DynamicMerge_v2_magic_division
<
LowLengths
>
{
low_lengths
};
}
template
<
typename
UpLengths
,
bool
Use24BitIntegerCalculation
=
false
>
template
<
typename
UpLengths
,
bool
Use24BitIntegerCalculation
=
false
>
__host__
__device__
constexpr
auto
make_unmerge_transform
(
__host__
__device__
constexpr
auto
make_unmerge_transform
(
const
UpLengths
&
up_lengths
,
const
UpLengths
&
up_lengths
,
...
...
composable_kernel/include/utility/magic_division.hpp
View file @
3a7fd7d6
...
@@ -118,6 +118,7 @@ struct MagicDivision
...
@@ -118,6 +118,7 @@ struct MagicDivision
return
(
tmp
+
dividend
)
>>
shift
;
return
(
tmp
+
dividend
)
>>
shift
;
}
}
#if 1 // debug
// HACK: magic division for int32_t
// HACK: magic division for int32_t
// HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be
// HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be
// non-negative for result to be correct
// non-negative for result to be correct
...
@@ -127,8 +128,25 @@ struct MagicDivision
...
@@ -127,8 +128,25 @@ struct MagicDivision
{
{
uint32_t
dividend_u32
=
as_type
<
uint32_t
>
(
dividend_i32
);
uint32_t
dividend_u32
=
as_type
<
uint32_t
>
(
dividend_i32
);
uint32_t
tmp
=
((
uint64_t
)
dividend_u32
*
(
uint64_t
)
multiplier
)
>>
32
;
uint32_t
tmp
=
((
uint64_t
)
dividend_u32
*
(
uint64_t
)
multiplier
)
>>
32
;
return
(
tmp
+
dividend_
i
32
)
>>
shift
;
return
(
tmp
+
dividend_
u
32
)
>>
shift
;
}
}
#else
// the inline ASM is producing wrong result
__host__
__device__
static
int32_t
DoMagicDivision
(
int32_t
dividend_i32
,
uint32_t
multiplier
,
uint32_t
shift
)
{
uint32_t
r
;
asm
volatile
(
"
\n
\
v_mul_hi_u32 %0, %1, %2
\n
\
v_add_u32_e32 %0, %1, %0
\n
\
v_lshrrev_b32_e32 %0, %3, %0
\n
\
"
:
"=v"
(
r
)
:
"v"
(
as_type
<
uint32_t
>
(
dividend_i32
)),
"s"
(
multiplier
),
"s"
(
shift
));
return
as_type
<
int32_t
>
(
r
);
}
#endif
};
};
}
// namespace ck
}
// namespace ck
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @
3a7fd7d6
...
@@ -475,7 +475,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
...
@@ -475,7 +475,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
const
auto
descs
=
const
auto
descs
=
#if 1
#if 1
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad
#elif
0
#elif
1
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_no_pad
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_no_pad
#else
#else
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_1x1
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_1x1
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
View file @
3a7fd7d6
...
@@ -117,7 +117,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
...
@@ -117,7 +117,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
wei_k_y_x_c_device_buf
.
ToDevice
(
wei_k_y_x_c
.
mData
.
data
());
wei_k_y_x_c_device_buf
.
ToDevice
(
wei_k_y_x_c
.
mData
.
data
());
out_n_ho_wo_k_device_buf
.
ToDevice
(
out_n_ho_wo_k
.
mData
.
data
());
out_n_ho_wo_k_device_buf
.
ToDevice
(
out_n_ho_wo_k
.
mData
.
data
());
#if
0
#if
1
// cdata = 16, BlockSize = 64, 16x64x4
// cdata = 16, BlockSize = 64, 16x64x4
constexpr
index_t
BlockSize
=
64
;
constexpr
index_t
BlockSize
=
64
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment