Commit b2439ec9 (yangql/composable_kernel-1), authored May 30, 2019 by Chao Liu
Parent: 0a265731

    adding implicit gemm v4 (nchw, kcyx)

Showing 11 changed files with 445 additions and 135 deletions.
Changed files:

  driver/driver.hip.cpp                                                      +15   -9
  src/include/Array.hip.hpp                                                  +158  -4
  src/include/ConstantMergedTensorDescriptor.hip.hpp                         +1    -41
  src/include/ConstantTensorDescriptor.hip.hpp                               +85   -33
  src/include/Sequence.hip.hpp                                               +36   -0
  src/include/blockwise_3d_tensor_op.hip.hpp                                 +1    -1
  src/include/blockwise_4d_tensor_op.hip.hpp                                 +1    -1
  src/include/blockwise_generic_tensor_slice_op.hip.hpp                      +28   -15
  src/include/common.hip.hpp                                                 +12   -0
  src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp   +65   -24
  src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp   +43   -7
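For orientation: judging from the v4 descriptor setup further down, the input NCHW tensor is viewed through a merged [E, N1, B, N2] descriptor in which E merges (C, Y, X) and B merges (N0, Ho, Wo), so the convolution becomes a GEMM-like loop over E. A minimal host-side sketch of that index bookkeeping, assuming row-major merging; the sizes and helper names below are hypothetical and independent of the repo's descriptor types:

#include <cstdio>

// Hypothetical sizes, for illustration only; the driver in this commit enables a
// 1x1 filter, 14x14 image configuration (N = 128, C = 512) among others.
constexpr int C = 512, Y = 1, X = 1;     // filter dims merged into the GEMM dimension E
constexpr int N0 = 32, Ho = 14, Wo = 14; // batch/output dims merged into B (N0 is hypothetical)

// Assumed row-major merge: E indexes (c, y, x) and B indexes (n0, ho, wo).
constexpr int to_e(int c, int y, int x) { return (c * Y + y) * X + x; }
constexpr int to_b(int n0, int ho, int wo) { return (n0 * Ho + ho) * Wo + wo; }

int main()
{
    std::printf("E = %d, B = %d\n", C * Y * X, N0 * Ho * Wo);
    std::printf("e(2,0,0) = %d, b(1,3,5) = %d\n", to_e(2, 0, 0), to_b(1, 3, 5));
    return 0;
}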
driver/driver.hip.cpp

@@ -44,11 +44,7 @@ struct GeneratorTensor_3
     {
         std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};

-#if 0
-        auto f_acc = std::plus<index_t>{};
-#else
         auto f_acc = [](auto a, auto b) { return 100 * a + b; };
-#endif

         return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
     }
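For reference, the f_acc fold kept in GeneratorTensor_3 above packs the multi-index into a single value digit-by-digit in base 100. A standalone illustration (values hypothetical):

#include <array>
#include <cstdio>
#include <numeric>

using index_t = int; // stand-in for the driver's index_t

int main()
{
    // Same fold as in GeneratorTensor_3: acc = 100 * acc + i, starting from 0.
    auto f_acc = [](auto a, auto b) { return 100 * a + b; };

    std::array<index_t, 3> dims = {{1, 2, 3}};

    index_t v = std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);

    std::printf("%d\n", v); // prints 10203: the index (1, 2, 3), one base-100 "digit" per dimension
    return 0;
}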
@@ -447,7 +443,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 1
+#elif 0
     // 3x3 filter, 28x28 image
     constexpr index_t N = 128;
     constexpr index_t C = 256;

@@ -543,7 +539,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 1x1 filter, 14x14 image
     constexpr index_t N = 128;
     constexpr index_t C = 512;

@@ -553,6 +549,18 @@ int main(int argc, char* argv[])
     constexpr index_t Y = 1;
     constexpr index_t X = 1;

     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
+#elif 1
+    // 1x1 filter, 73x73 image
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 64;
+    constexpr index_t HI = 73;
+    constexpr index_t WI = 73;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    constexpr index_t HPad = 0;
+    constexpr index_t WPad = 0;
 #endif

@@ -609,8 +617,6 @@ int main(int argc, char* argv[])
     };

     wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
 #endif

-    // out_nkhw_device.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 }

 #if 1

@@ -649,7 +655,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 1
+#if 0
         if(Y == 3 && X == 3)
         {
            host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
src/include/Array.hip.hpp

@@ -105,6 +105,7 @@ __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_
     return new_array;
 }

+// Array = Array + Array
 template <class TData, index_t NSize>
 __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
 {

@@ -119,6 +120,55 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData,
     return result;
 }

+// Array = Array - Array
+template <class TData, index_t NSize>
+__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
+{
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        result[i] = a[i] - b[i];
+    });
+
+    return result;
+}
+
+// Array = Array + Sequence
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
+{
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        result[i] = a[i] + b.Get(I);
+    });
+
+    return result;
+}
+
+// Array = Array - Sequence
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
+{
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        result[i] = a[i] - b.Get(I);
+    });
+
+    return result;
+}
+
+// Array = Array * Sequence
 template <class TData, index_t NSize, index_t... Is>
 __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)

@@ -136,15 +186,119 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.
     return result;
 }

-template <class TData, index_t NSize, class F>
-__host__ __device__ constexpr TData reduce_on_array(Array<TData, NSize> a, F f)
+// Array = Sequence - Array
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
 {
-    TData result = a[0];
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;

-    static_for<1, NSize, 1>{}([&](auto I) {
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        result[i] = a.Get(I) - b[i];
+    });
+
+    return result;
+}
+
+template <class TData, index_t NSize, class Reduce>
+__host__ __device__ constexpr TData
+accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
+{
+    TData result = init;
+
+    static_assert(NSize > 0, "wrong");
+
+    static_for<0, NSize, 1>{}([&](auto I) {
         constexpr index_t i = I.Get();
         result = f(result, a[i]);
     });

     return result;
 }
+
+template <class T, index_t NSize>
+__host__ __device__ void print_Array(const char* s, Array<T, NSize> a)
+{
+    constexpr index_t nsize = a.GetSize();
+
+    static_assert(nsize > 0 && nsize <= 10, "wrong!");
+
+    static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); });
+
+    static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); });
+
+    static_if<nsize == 3>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); });
+
+    static_if<nsize == 4>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); });
+
+    static_if<nsize == 5>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
+    });
+
+    static_if<nsize == 6>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
+    });
+
+    static_if<nsize == 7>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+    });
+
+    static_if<nsize == 8>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+    });
+
+    static_if<nsize == 9>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8]);
+    });
+
+    static_if<nsize == 10>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9]);
+    });
+}
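The accumulate_on_array helper added above is a left fold with an explicit initial value, which is what lets the blockwise copy later in this commit seed the reduction with 0 instead of a[0] as the removed reduce_on_array did. A host-only analogue over std::array, a sketch that leaves out the repo's Array/static_for machinery:

#include <array>
#include <cstdio>

// Host-only analogue of the accumulate_on_array added above: a left fold with an explicit
// initial value (the removed reduce_on_array seeded the fold with a[0] instead).
template <class TData, std::size_t NSize, class Reduce>
constexpr TData accumulate_on_array(const std::array<TData, NSize>& a, Reduce f, TData init)
{
    TData result = init;
    for(std::size_t i = 0; i < NSize; ++i)
        result = f(result, a[i]);
    return result;
}

int main()
{
    const std::array<int, 4> partial_offsets = {{3, 5, 7, 11}};

    // e.g. summing per-dimension partial offsets into one linear offset, as the
    // blockwise copy constructor in this commit does with mod_conv::plus
    const int total = accumulate_on_array(partial_offsets, [](int x, int y) { return x + y; }, 0);

    std::printf("total offset = %d\n", total); // 26
    return 0;
}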
src/include/ConstantMergedTensorDescriptor.hip.hpp

@@ -99,15 +99,7 @@ struct ConstantMergedTensorDescriptor
         return original_multi_id;
     }

-#if 0 // not needed
-    __host__ __device__ static index_t
-    GetOffsetFromOriginalMultiIndex(Array<index_t, nOriginalDim> original_multi_id)
-    {
-        return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
-    }
-#endif
-
-    __host__ __device__ static index_t GetOffsetFromMultiIndexA(Array<index_t, nDim> multi_id)
+    __host__ __device__ static index_t GetOffsetFromMultiIndex(Array<index_t, nDim> multi_id)
     {
         const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);

@@ -126,38 +118,6 @@ struct ConstantMergedTensorDescriptor
         return dummy_desc.GetMultiIndexFrom1dIndex(id);
     }

-#if 0 // not needed
-    template <index_t IDim>
-    __host__ __device__ static index_t GetNewOriginalMultiIndexAfterMovingAlongOneDimension(
-        Array<index_t, nOriginalDim> old_original_multi_id, Number<IDim>, index_t step_size)
-    {
-        auto new_original_multi_id = old_original_multi_id;
-
-        // get partial-original-multi-id corresponding to this merged dimension
-        constexpr auto original_partial_dims = std::get<IDim>(mOriginalDimMergeSeqs);
-
-        constexpr auto original_partial_tensor_desc =
-            OriginalTensorDesc::Extract(original_partial_dims);
-
-        auto old_original_partial_multi_id =
-            extract_array(old_original_mutli_id, original_paritial_dims);
-
-        auto new_original_partial_multi_id =
-            original_partial_tensor_desc.GetNewMultiIndexGivenStepSizeOf1dIndex(
-                old_original_partial_multi_id, step_size);
-
-        // update original-mutli-id
-        static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) {
-            constexpr auto I = decltype(I_){};
-
-            constexpr index_t idim_original = original_dims_partial.Get(I);
-
-            new_original_multi_id[idim_original] = original_multi_id_partial[I.Get()];
-        });
-
-        return new_original_multi_id;
-    }
-#endif
 };

 template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
src/include/ConstantTensorDescriptor.hip.hpp

@@ -4,7 +4,7 @@
 template <class Lengths>
 __host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths)
 {
-    return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
+    return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), mod_conv::multiplies<index_t>{})
         .PushBack(Number<1>{});
 }

@@ -95,7 +95,7 @@ struct ConstantTensorDescriptor
     __host__ __device__ static constexpr index_t GetElementSize()
     {
-        return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
+        return accumulate_on_sequence(Lengths{}, mod_conv::multiplies<index_t>{}, Number<1>{});
     }

     // WRONG! ReorderGivenOld2New is broken

@@ -107,10 +107,10 @@ struct ConstantTensorDescriptor
         constexpr auto strides_in_rank = GetStrides().ReorderGivenOld2new(MemoryRank{});

         constexpr index_t element_space_unaligned = accumulate_on_sequence(
-            (lengths_in_rank - Number<1>{}) * strides_in_rank, std::plus<index_t>{}, Number<1>{});
+            (lengths_in_rank - Number<1>{}) * strides_in_rank, mod_conv::plus<index_t>{}, Number<1>{});
 #else
         // WRONG! align shouldbe applied to the last memory rank, not the last tensor dimension
         constexpr index_t element_space_unaligned = accumulate_on_sequence(
-            (GetLengths() - Number<1>{}) * GetStrides(), std::plus<index_t>{}, Number<1>{});
+            (GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus<index_t>{}, Number<1>{});
 #endif

         return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());

@@ -144,7 +144,8 @@ struct ConstantTensorDescriptor
         constexpr auto multi_id = Sequence<Is...>{};

-        return accumulate_on_sequence(multi_id * GetStrides(), std::plus<index_t>{}, Number<0>{});
+        return accumulate_on_sequence(
+            multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
     }

 #if 0 // ReorderGivenOld2new is broken

@@ -197,32 +198,70 @@ struct ConstantTensorDescriptor
     // This function doesn't do carry check on the highest dimension, for performance reason.
     // It is the user's responsibility to make sure the result "new_mutli_id" is not out-of-bound
     // on the highest dimension
+    template <bool PositiveDirection>
     __host__ __device__ static Array<index_t, nDim>
-    UpdateMultiIndexGivenStepSizeOf1dIndex(Array<index_t, nDim> old_multi_id,
-                                           index_t step_size_of_1d_index)
+    UpdateMultiIndexGivenStepSizeOf1dIndex(Array<index_t, nDim> old_multi_id,
+                                           index_t step_size_of_1d_index,
+                                           integral_constant<bool, PositiveDirection>)
     {
-        auto new_multi_id = old_multi_id + GetMultiIndexFrom1dIndex(step_size_of_1d_index);
-
-        bool carry = false;
-
-        // do carry check in reversed order, starting from lowest dimension
-        // don't check the highest dimension
-        static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
-            constexpr index_t idim = nDim - 1 - IDimReverse.Get();
-            constexpr auto IDim    = Number<idim>{};
-
-            if(carry)
-            {
-                ++new_multi_id[idim];
-            }
-
-            carry = false;
-
-            if(new_multi_id[idim] >= GetLength(IDim))
-            {
-                new_multi_id[idim] -= GetLength(IDim);
-                carry = true;
-            }
-        });
+        Array<index_t, nDim> new_multi_id;
+
+        const auto step_sizes = GetMultiIndexFrom1dIndex(step_size_of_1d_index);
+
+        static_if<PositiveDirection>{}([&](auto) {
+            new_multi_id = old_multi_id + step_sizes;
+
+            bool carry = false;
+
+            // do carry check in reversed order, starting from lowest dimension
+            // don't check the highest dimension
+            static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
+                constexpr index_t idim = nDim - 1 - IDimReverse.Get();
+                constexpr auto IDim    = Number<idim>{};
+
+                if(carry)
+                {
+                    ++new_multi_id[idim];
+                }
+
+                carry = false;
+
+                if(new_multi_id[idim] >= GetLength(IDim))
+                {
+                    new_multi_id[idim] -= GetLength(IDim);
+                    carry = true;
+                }
+            });
+        }).Else([&](auto) {
+            // shift up multi-id to avoid unsigned integer underflow during intermediate
+            // calculations. After the shift, should have new_multi_id[...] >= 1
+            new_multi_id = old_multi_id + (GetLengths() - step_sizes);
+
+            bool borrow = false;
+
+            // do borrow check in reversed order, starting from lowest dimension
+            // don't check the highest dimension
+            static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
+                constexpr index_t idim = nDim - 1 - IDimReverse.Get();
+                constexpr auto IDim    = Number<idim>{};
+
+                if(borrow)
+                {
+                    --new_multi_id[idim];
+                }
+
+                borrow = false;
+
+                if(new_multi_id[idim] < GetLength(IDim))
+                {
+                    new_multi_id[idim] += GetLength(IDim);
+                    borrow = true;
+                }
+            });
+
+            // shift back down multi-id
+            // here, should have new_multi_id[...] >= GetLengths()
+            new_multi_id = new_multi_id - GetLengths();
+        });

         return new_multi_id;
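The reworked UpdateMultiIndexGivenStepSizeOf1dIndex above now walks the multi-index in either direction, with a carry pass for positive steps and a borrow pass for negative ones. A host-only sketch of the same arithmetic on a plain std::array; as in the diff, the highest dimension is never carry/borrow-checked (the caller must keep it in range), and the per-dimension step sizes, which the member function derives from a 1d step, are passed in directly here:

#include <array>
#include <cstdio>

// Host-only sketch of the carry/borrow update introduced above.
template <std::size_t N>
std::array<unsigned, N> update_multi_index(std::array<unsigned, N> old_id,
                                           std::array<unsigned, N> step,
                                           std::array<unsigned, N> len,
                                           bool positive_direction)
{
    std::array<unsigned, N> id{};

    if(positive_direction)
    {
        for(std::size_t i = 0; i < N; ++i) id[i] = old_id[i] + step[i];

        bool carry = false;
        for(std::size_t r = 0; r + 1 < N; ++r) // lowest dimension first, dimension 0 not checked
        {
            const std::size_t i = N - 1 - r;
            if(carry) ++id[i];
            carry = false;
            if(id[i] >= len[i]) { id[i] -= len[i]; carry = true; }
        }
    }
    else
    {
        // shift up first so the unsigned arithmetic cannot underflow
        for(std::size_t i = 0; i < N; ++i) id[i] = old_id[i] + (len[i] - step[i]);

        bool borrow = false;
        for(std::size_t r = 0; r + 1 < N; ++r) // lowest dimension first, dimension 0 not checked
        {
            const std::size_t i = N - 1 - r;
            if(borrow) --id[i];
            borrow = false;
            if(id[i] < len[i]) { id[i] += len[i]; borrow = true; }
        }

        for(std::size_t i = 0; i < N; ++i) id[i] -= len[i]; // shift back down
    }

    return id;
}

int main()
{
    // lengths {4, 3, 4}: step (0,1,3) forward by one element, and (1,1,0) backward by one
    const auto fwd = update_multi_index<3>({{0, 1, 3}}, {{0, 0, 1}}, {{4, 3, 4}}, true);
    const auto bwd = update_multi_index<3>({{1, 1, 0}}, {{0, 0, 1}}, {{4, 3, 4}}, false);
    std::printf("fwd: %u %u %u, bwd: %u %u %u\n",
                fwd[0], fwd[1], fwd[2], bwd[0], bwd[1], bwd[2]); // fwd: 0 2 0, bwd: 1 0 3
    return 0;
}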
@@ -255,7 +294,7 @@ struct ConstantTensorDescriptor
     }

     template <class... Ts>
-    __host__ __device__ static constexpr auto Inject(ConstantTensorDescriptor<Ts...>)
+    __host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor<Ts...>)
     {
         using leaf_tensor = ConstantTensorDescriptor<Ts...>;

@@ -290,7 +329,7 @@ struct ConstantTensorDescriptor
         constexpr auto fold_intervals = Sequence<FoldIntervals...>{};

         constexpr index_t fold_intervals_product =
-            accumulate_on_sequence(fold_intervals, std::multiplies<index_t>{}, Number<1>{});
+            accumulate_on_sequence(fold_intervals, mod_conv::multiplies<index_t>{}, Number<1>{});

         constexpr auto unfold_length = GetLength(Number<IDim>{});
         constexpr auto unfold_stride = GetStride(Number<IDim>{});

@@ -309,7 +348,7 @@ struct ConstantTensorDescriptor
         constexpr auto fold_strides =
             Number<unfold_stride>{} *
-            reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}), std::multiplies<index_t>{});
+            reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}), mod_conv::multiplies<index_t>{});

         // folded_ranks
         constexpr auto fold_ranks =

@@ -389,7 +428,7 @@ struct ConstantTensorDescriptor
         // unfolded length, stride and rank
         constexpr index_t unfold_length = accumulate_on_sequence(
-            GetLengths().Extract(middle), std::multiplies<index_t>{}, Number<1>{});
+            GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});

         constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});

@@ -472,7 +511,20 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
 {
     constexpr index_t ndim = TDesc::GetNumOfDimension();

-    static_assert(ndim >= 2 && ndim <= 10, "wrong!");
+    static_assert(ndim >= 1 && ndim <= 10, "wrong!");
+
+    static_if<ndim == 1>{}([&](auto fwd) {
+        constexpr auto I0 = Number<0>{};
+
+        constexpr auto desc = fwd(TDesc{});
+
+        printf("%s dim %u, lengths {%u}, strides {%u}, ranks {%u}\n",
+               s,
+               desc.GetNumOfDimension(),
+               desc.GetLength(I0),
+               desc.GetStride(I0),
+               desc.GetMemoryRank(I0));
+    });

     static_if<ndim == 2>{}([&](auto fwd) {
         constexpr auto I0 = Number<0>{};
src/include/Sequence.hip.hpp

@@ -495,3 +495,39 @@ __host__ __device__ constexpr auto Sequence<Is...>::Modify(Number<I>, Number<X>)
     return seq_left.PushBack(Number<X>{}).Append(seq_right);
 }
+
+template <index_t... Xs>
+__host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
+{
+    constexpr index_t nsize = Sequence<Xs...>::GetSize();
+
+    static_assert(nsize <= 10, "wrong!");
+
+    static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 3>{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 4>{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 5>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 6>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 7>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 8>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 9>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 10>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+}
src/include/blockwise_3d_tensor_op.hip.hpp

@@ -158,7 +158,7 @@ struct Blockwise3dTensorCopy3
                       "wrrong! BlockSize is not big enough for ThreadPerDims!");

         constexpr index_t num_active_thread =
-            accumulate_on_sequence(ThreadPerDims{}, std::multiplies<index_t>{}, Number<1>{});
+            accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});

         if(BlockSize > num_active_thread)
         {
src/include/blockwise_4d_tensor_op.hip.hpp

@@ -500,7 +500,7 @@ struct Blockwise4dTensorCopy3
                       "wrrong! BlockSize is not big enough for ThreadPerDims!");

         constexpr index_t num_active_thread =
-            accumulate_on_sequence(ThreadPerDims{}, std::multiplies<index_t>{}, Number<1>{});
+            accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});

         if(BlockSize > num_active_thread)
         {
src/include/blockwise_generic_tensor_slice_op.hip.hpp

@@ -3,6 +3,7 @@
 // slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
 // memory layout (ordering of dimensions) can be different between src and dst
+// For now, only support SubLengths == 1 on a merged dimension
 template <index_t BlockSize,
           class Float,
           class SrcDesc,

@@ -47,7 +48,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
     BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_multi_id_begin,
                                        Array<index_t, nDim> dst_block_data_multi_id_begin)
     {
-        // check NDim consistent
+        // check NDim consistency
         static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                       nDim == DstDesc::GetNumOfDimension() &&
                       nDim == SliceLengths::GetSize() &&
                       nDim == SubLengths::GetSize() &&
                       nDim == DataClusterLengths::GetSize() &&

@@ -55,7 +56,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                       nDim == SrcAccessOrder::GetSize() &&
                       nDim == DstAccessOrder::GetSize(),
                       "wrong");

-        // check
+        // check thread arrange order and read/write access order are valid
         static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
                       is_valid_sequence_map<SrcAccessOrder>::value &&
                       is_valid_sequence_map<DstAccessOrder>::value,

@@ -140,10 +141,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });

         // complete offset
-        mThreadSrcOffset = reduce_on_array(mThreadSrcPartialOffsets, std::plus<index_t>{});
-
-        mThreadDstOffset = reduce_on_array(mThreadDstPartialOffsets, std::plus<index_t>{});
+        mThreadSrcOffset = accumulate_on_array(
+            mThreadSrcPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));
+
+        mThreadDstOffset = accumulate_on_array(
+            mThreadDstPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));

 #if 0
         if(get_block_1d_id() == 0)
         {
             printf("id %5u %5u: "
                    "src_block_data_multi_id_begin: %u %u %u %u, "

@@ -279,13 +284,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
     // the boundary of the tensor being sliced. This functions doesn't do runtime sanity
     // check on out-of-bound slicing window, for performance reason
     template <index_t IDim_, index_t StepSize, bool PositiveDirection>
-    __device__ void MoveSlicingWindowOnSourceTensor(
-        Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection>)
+    __device__ void MoveSlicingWindowOnSourceTensor(
+        Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
     {
-        static_assert(PositiveDirection,
-                      "wrong! only support movement in positive direction for now");
-
         constexpr auto IDim    = Number<IDim_>{};
         constexpr index_t idim = IDim.Get();

@@ -306,7 +307,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 auto new_src_partial_original_multi_id =
                     src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
-                        old_src_partial_original_multi_id, StepSize);
+                        old_src_partial_original_multi_id, StepSize, direction);

                 // update "mThreadSrcOriginalMultiId"
                 static_for<0, src_partial_original_dims.GetSize(), 1>{}([&](auto I_) {

@@ -328,7 +329,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 mThreadSrcPartialOffsets[idim] = new_src_partial_offset;

                 // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
-                mThreadSrcOffset = mThreadSrcOffset + new_src_partial_offset - old_src_partial_offset;
+                mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
             }).Else([&](auto fwd) {
                 // Logic for non-merged dimension. If you are never going to move the slicing window on
                 // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",

@@ -336,13 +337,25 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 // should be able to remove these calculations.
                 // TODO: make sure compiler would actually remove them in this case.

                 // It is the user's responsiblity to make sure the slicing window will not be moved out
                 // of the boundary of the tensor being sliced. Otherwise, there might be hazard like
                 // unsigned integer underflow. That is NO runtime sanity check to prevent the hazard
                 constexpr index_t idim_original =
                     SrcDesc::GetContainedOriginalDimensions(IDim).Front();

-                mThreadSrcOffset += StepSize * SrcDesc::GetStride(IDim);
-
-                mThreadSrcOriginalMultiId[idim_original] += StepSize;
-
-                mThreadSrcPartialOffsets[idim] += StepSize * SrcDesc::GetStride(IDim);
+                static_if<PositiveDirection>{}([&](auto) {
+                    mThreadSrcOffset += StepSize * SrcDesc::GetStride(IDim);
+
+                    mThreadSrcOriginalMultiId[idim_original] += StepSize;
+
+                    mThreadSrcPartialOffsets[idim] += StepSize * SrcDesc::GetStride(IDim);
+                }).Else([&](auto) {
+                    mThreadSrcOffset -= StepSize * SrcDesc::GetStride(IDim);
+
+                    mThreadSrcOriginalMultiId[idim_original] -= StepSize;
+
+                    mThreadSrcPartialOffsets[idim] -= StepSize * SrcDesc::GetStride(IDim);
+                });
             });
         }
     };
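MoveSlicingWindowOnSourceTensor now receives the direction as a compile-time integral_constant<bool, PositiveDirection> and branches on it with the repo's static_if. A self-contained analogue of that dispatch pattern using standard if constexpr; the Window type and its members are hypothetical, not the repo's API:

#include <cstdio>
#include <type_traits>

// Toy analogue of the new direction parameter: dispatch on a compile-time bool tag
// (the repo uses static_if<>{}(...).Else(...) instead of if constexpr).
struct Window
{
    unsigned offset = 0;

    template <bool PositiveDirection>
    void Move(unsigned step, unsigned stride, std::integral_constant<bool, PositiveDirection>)
    {
        if constexpr(PositiveDirection)
            offset += step * stride; // march forward along the dimension
        else
            offset -= step * stride; // rewind, e.g. to reset the window after an inner loop
    }
};

int main()
{
    constexpr auto Forward  = std::integral_constant<bool, true>{};
    constexpr auto Backward = std::integral_constant<bool, false>{};

    Window w;
    w.Move(8, 4, Forward);  // advance by 8 elements of stride 4
    w.Move(8, 4, Backward); // move back by the same amount
    std::printf("offset = %u\n", w.offset); // 0
    return 0;
}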
src/include/common.hip.hpp

@@ -39,6 +39,18 @@ struct scales
     __host__ __device__ constexpr T operator()(T a) const { return s * a; }
 };

+template <class T>
+struct plus
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
+};
+
+template <class T>
+struct multiplies
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
+};
+
 template <class T>
 struct integer_divide_ceiler
 {
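The plus and multiplies functors added above mirror std::plus/std::multiplies but are marked __host__ __device__ so they can be used in device-side constexpr folds. Call sites elsewhere in this commit qualify them as mod_conv::plus and mod_conv::multiplies, so they presumably sit in a mod_conv namespace that this hunk does not show. A host-only sketch of the same idea in use:

#include <cstdio>

// Host-only sketch; in the repo these are additionally __host__ __device__, and the
// mod_conv namespace is assumed from the qualified call sites in this commit.
namespace mod_conv {
template <class T>
struct plus
{
    constexpr T operator()(T a, T b) const { return a + b; }
};

template <class T>
struct multiplies
{
    constexpr T operator()(T a, T b) const { return a * b; }
};
} // namespace mod_conv

// Minimal constexpr fold in the spirit of accumulate_on_sequence/accumulate_on_array.
template <class F, class T, class... Ts>
constexpr T fold(F f, T init, Ts... xs)
{
    T result = init;
    T vals[] = {static_cast<T>(xs)...};
    for(T v : vals)
        result = f(result, v);
    return result;
}

static_assert(fold(mod_conv::multiplies<int>{}, 1, 2, 3, 4) == 24, "product of lengths");
static_assert(fold(mod_conv::plus<int>{}, 0, 1, 2, 3) == 6, "sum of offsets");

int main()
{
    std::printf("%d\n", fold(mod_conv::multiplies<int>{}, 1, 2, 3, 4)); // 24
    return 0;
}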
src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp

@@ -58,6 +58,9 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
     constexpr auto I6 = Number<6>{};
     constexpr auto I7 = Number<7>{};

+    constexpr auto True  = integral_constant<bool, true>{};
+    constexpr auto False = integral_constant<bool, false>{};
+
     constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
     constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
     constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};

@@ -123,7 +126,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
     // input blockwise copy
     //   slice a merged tensor, reorder and copy to a normal tensor
     //   this copy operator already has blockwise offset built-in
-    const auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
+    auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
         BlockSize,
         Float,
         decltype(in_c_n1_b_n2_global_merged_desc),

@@ -150,20 +153,20 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
     // operator for blockwise copy of weight into LDS
     //   slice a tensor, and copy it into another tensor
     //   this copy operator already have blockwise offset built-in
-    const auto blockwise_wei_copy =
-#if 0
+    auto blockwise_wei_copy =
+#if 1
         BlockwiseGenericTensorSliceCopy_v1<BlockSize,
                                            Float,
                                            decltype(wei_c_k_global_desc),
                                            decltype(wei_c_k_block_desc),
                                            decltype(wei_c_k_block_desc.GetLengths()),
                                            WeiBlockCopySubLengths_C_K,
                                            WeiBlockCopyClusterLengths_C_K,
                                            Sequence<0, 1>, // thread_arrange_order [C, K]
                                            Sequence<0, 1>, // src_access_order [C, K]
                                            Sequence<0, 1>, // dst_access_order [C, K]
                                            WeiBlockCopyDataPerAccess_K,
                                            WeiBlockCopyDataPerAccess_K>(
             {0, k_block_data_on_global}, {0, 0});
 #else
         Blockwise2dTensorCopy3<BlockSize,

@@ -175,16 +178,14 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
             {0, 0});
 #endif

     // GEMM definition
     //   c_mtx += transpose(a_mtx) * b_mtx
     //     a_mtx[CPerBlock, KPerBlock] is in LDS
     //     b_mtx[CPerBlocl, N1 * BPerBlock * N2] is in LDS
     //     c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in
     //       register
     constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(
         Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_c_k_block_desc.GetStride(I0)>{});

     constexpr auto b_c_n1bn2_block_mtx_desc = make_ConstantMatrixDescriptor(Number<CPerBlock>{},

@@ -239,6 +240,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
     // zero out threadwise output
     threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread);

+#if 0
     // do work
     for(index_t y = 0; y < Y; ++y)
     {

@@ -269,6 +271,45 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
             }
         }
     }
+#else
+    for(index_t y = 0; y < Y; ++y)
+    {
+        for(index_t x = 0; x < X; ++x)
+        {
+            // calculate origin of block input and weight tensor on global memory
+            const Float* p_in_block_on_global =
+                p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x);
+
+            const Float* p_wei_block_on_global =
+                p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0);
+
+            for(index_t c_block_data_on_global = 0; c_block_data_on_global < C;
+                c_block_data_on_global += CPerBlock)
+            {
+                blockwise_in_copy.Run(p_in_block_on_global, p_in_block);
+                blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block);
+
+                __syncthreads();
+
+                blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread);
+
+                __syncthreads();
+
+                // move on C: C_N1_B_N2, C_K
+                blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<CPerBlock>{}, True);
+                blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<CPerBlock>{}, True);
+            }
+
+            // reset C
+            blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
+            blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
+        }
+    }
+#endif

     // copy output: register to global memory
     {
src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp

@@ -59,7 +59,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
     constexpr auto I6 = Number<6>{};
     constexpr auto I7 = Number<7>{};

-    constexpr auto TRUE = integral_constant<bool, true>{};
+    constexpr auto True = integral_constant<bool, true>{};

     constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
     constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};

@@ -102,9 +102,9 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
     const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;

     // input tensor
-    //   tensor descriptor in device memory [N0, N1, N2, H, W]
-    constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Hi>{})
-                                                     .Slice(I3, Number<Wi>{})
+    //   tensor descriptor in device memory [N0, N1, N2, Ho, Wo]
+    constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Ho>{})
+                                                     .Slice(I3, Number<Wo>{})
                                                      .Fold(I0, Number<N1>{}, Number<N2>{})
                                                      .Extract(Sequence<0, 1, 2, 4, 5>{});

@@ -115,12 +115,23 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
     // merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
     constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
-        in_c_y_x_global_desc.Inject(in_n0_n1_n2_h_w_global_desc),
+        in_c_y_x_global_desc.Embed(in_n0_n1_n2_h_w_global_desc),
         Sequence<0, 1, 2>{},
         Sequence<4>{},
         Sequence<3, 6, 7>{},
         Sequence<5>{});

+#if 0
+    if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
+    {
+        print_ConstantTensorDescriptor(in_n0_n1_n2_h_w_global_desc,
+                                       "in_n0_n1_n2_h_w_global_desc: ");
+        print_ConstantTensorDescriptor(in_c_y_x_global_desc, "in_c_y_x_global_desc: ");
+
+        print_ConstantMergedTensorDescriptor(in_e_n1_b_n2_global_merged_desc,
+                                             "in_e_n1_b_n2_global_merged_desc: ");
+    }
+#endif
+
     // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
     //   be careful of LDS alignment
     constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(

@@ -243,6 +254,31 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
     // do work
     for(index_t e = 0; e < E; e += EPerBlock)
     {
+#if 0
+        if(e == 1 * EPerBlock && get_block_1d_id() == 0)
+        {
+            printf("id %5u %5u: "
+                   "mThreadSrcOriginalMultiId %u %u %u %u %u %u %u %u, "
+                   "mThreadSrcPartialOffsets %u %u %u %u, "
+                   "mThreadSrcOffset %u, mThreadDstOffset %u \n",
+                   get_block_1d_id(),
+                   get_thread_local_1d_id(),
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[0],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[1],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[2],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[3],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[4],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[5],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[6],
+                   blockwise_in_copy.mThreadSrcOriginalMultiId[7],
+                   blockwise_in_copy.mThreadSrcPartialOffsets[0],
+                   blockwise_in_copy.mThreadSrcPartialOffsets[1],
+                   blockwise_in_copy.mThreadSrcPartialOffsets[2],
+                   blockwise_in_copy.mThreadSrcPartialOffsets[3],
+                   blockwise_in_copy.mThreadSrcOffset,
+                   blockwise_in_copy.mThreadDstOffset);
+        }
+#endif
+
         // marching slicing window
         blockwise_in_copy.Run(p_in_global, p_in_block);
         blockwise_wei_copy.Run(p_wei_global, p_wei_block);

@@ -253,8 +289,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
         __syncthreads();

-        blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, TRUE);
-        blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, TRUE);
+        blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
+        blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
     }

     // copy output: register to global memory