gaoqiong / composable_kernel · Commit b2439ec9

authored May 30, 2019 by Chao Liu

    adding implicit gemm v4 (nchw, kcyx)

parent 0a265731
Showing 11 changed files with 445 additions and 135 deletions:

  driver/driver.hip.cpp                                                      +15   -9
  src/include/Array.hip.hpp                                                 +158   -4
  src/include/ConstantMergedTensorDescriptor.hip.hpp                          +1  -41
  src/include/ConstantTensorDescriptor.hip.hpp                               +85  -33
  src/include/Sequence.hip.hpp                                               +36   -0
  src/include/blockwise_3d_tensor_op.hip.hpp                                  +1   -1
  src/include/blockwise_4d_tensor_op.hip.hpp                                  +1   -1
  src/include/blockwise_generic_tensor_slice_op.hip.hpp                      +28  -15
  src/include/common.hip.hpp                                                 +12   -0
  src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp   +65  -24
  src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp   +43   -7
driver/driver.hip.cpp

@@ -44,11 +44,7 @@ struct GeneratorTensor_3
     {
         std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};
-#if 0
-        auto f_acc = std::plus<index_t>{};
-#else
         auto f_acc = [](auto a, auto b) { return 100 * a + b; };
-#endif
         return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
     }

@@ -447,7 +443,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 1
+#elif 0
     // 3x3 filter, 28x28 image
     constexpr index_t N = 128;
     constexpr index_t C = 256;

@@ -543,7 +539,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 1x1 filter, 14x14 image
     constexpr index_t N = 128;
     constexpr index_t C = 512;

@@ -553,6 +549,18 @@ int main(int argc, char* argv[])
     constexpr index_t Y = 1;
     constexpr index_t X = 1;
+    constexpr index_t HPad = 0;
+    constexpr index_t WPad = 0;
+#elif 1
+    // 1x1 filter, 73x73 image
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 64;
+    constexpr index_t HI = 73;
+    constexpr index_t WI = 73;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
 #endif

@@ -609,8 +617,6 @@ int main(int argc, char* argv[])
     };

     wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
 #endif
-
-    // out_nkhw_device.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
     }

 #if 1

@@ -649,7 +655,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 1
+#if 0
         if(Y == 3 && X == 3)
         {
             host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads);
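Note (editor's illustration, not part of the commit): the GeneratorTensor_3 change above swaps a plain sum of the multi-index for a base-100 positional fold, which makes generated values decodable back into their indices while debugging. A minimal standalone sketch of the same fold, assuming every index stays below 100:

    // Standalone sketch of the f_acc fold; a value like 10203 decodes back
    // to the multi-index (1, 2, 3). With std::plus, (1, 2, 3) would just be 6.
    #include <array>
    #include <cstdio>
    #include <numeric>

    using index_t = unsigned;

    template <class... Is>
    index_t encode_multi_index(Is... is)
    {
        std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};
        auto f_acc = [](auto a, auto b) { return 100 * a + b; };
        return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
    }

    int main()
    {
        printf("%u\n", encode_multi_index(1, 2, 3)); // prints 10203
    }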
src/include/Array.hip.hpp

@@ -105,6 +105,7 @@ __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_
     return new_array;
 }

+// Array = Array + Array
 template <class TData, index_t NSize>
 __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
 {

@@ -119,6 +120,55 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData,
     return result;
 }

+// Array = Array - Array
+template <class TData, index_t NSize>
+__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
+{
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        result[i] = a[i] - b[i];
+    });
+
+    return result;
+}
+
+// Array = Array + Sequence
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
+{
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        result[i] = a[i] + b.Get(I);
+    });
+
+    return result;
+}
+
+// Array = Array - Sequence
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
+{
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        result[i] = a[i] - b.Get(I);
+    });
+
+    return result;
+}
+
 // Array = Array * Sequence
 template <class TData, index_t NSize, index_t... Is>
 __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)

@@ -136,15 +186,119 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.
     return result;
 }

-template <class TData, index_t NSize, class F>
-__host__ __device__ constexpr TData reduce_on_array(Array<TData, NSize> a, F f)
+// Array = Sequence - Array
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
 {
-    TData result = a[0];
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;

-    static_for<1, NSize, 1>{}([&](auto I) {
+    static_for<0, NSize, 1>{}([&](auto I) {
         constexpr index_t i = I.Get();
+        result[i] = a.Get(I) - b[i];
+    });
+
+    return result;
+}
+
+template <class TData, index_t NSize, class Reduce>
+__host__ __device__ constexpr TData
+accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
+{
+    TData result = init;
+
+    static_assert(NSize > 0, "wrong");
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
         result = f(result, a[i]);
     });

     return result;
 }
+
+template <class T, index_t NSize>
+__host__ __device__ void print_Array(const char* s, Array<T, NSize> a)
+{
+    constexpr index_t nsize = a.GetSize();
+
+    static_assert(nsize > 0 && nsize <= 10, "wrong!");
+
+    static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); });
+
+    static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); });
+
+    static_if<nsize == 3>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); });
+
+    static_if<nsize == 4>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); });
+
+    static_if<nsize == 5>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
+    });
+
+    static_if<nsize == 6>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
+    });
+
+    static_if<nsize == 7>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+    });
+
+    static_if<nsize == 8>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+    });
+
+    static_if<nsize == 9>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8]);
+    });
+
+    static_if<nsize == 10>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n",
+               s, nsize, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9]);
+    });
+}
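Note (editor's sketch, not repo code): the new accumulate_on_array replaces reduce_on_array and takes an explicit init value, so the fold no longer has to seed itself from a[0]. The repo unrolls the loop at compile time with its own static_for; a self-contained standard-C++17 equivalent using a fold expression over an index pack:

    // Compile-time-unrolled fold over a fixed-size array, with explicit init.
    #include <array>
    #include <cstdio>
    #include <utility>

    template <class T, std::size_t N, class Reduce, std::size_t... Is>
    constexpr T accumulate_impl(const std::array<T, N>& a, Reduce f, T init,
                                std::index_sequence<Is...>)
    {
        T result = init;
        // the comma-fold plays the role of static_for: one step per element
        ((result = f(result, a[Is])), ...);
        return result;
    }

    template <class T, std::size_t N, class Reduce>
    constexpr T accumulate_on_array(const std::array<T, N>& a, Reduce f, T init)
    {
        return accumulate_impl(a, f, init, std::make_index_sequence<N>{});
    }

    int main()
    {
        constexpr std::array<unsigned, 4> a = {{1, 2, 3, 4}};
        constexpr unsigned sum =
            accumulate_on_array(a, [](unsigned x, unsigned y) { return x + y; }, 0u);
        printf("%u\n", sum); // 10
    }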
src/include/ConstantMergedTensorDescriptor.hip.hpp

@@ -99,15 +99,7 @@ struct ConstantMergedTensorDescriptor
         return original_multi_id;
     }

-#if 0 // not needed
-    __host__ __device__ static index_t
-    GetOffsetFromOriginalMultiIndex(Array<index_t, nOriginalDim> original_multi_id)
-    {
-        return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
-    }
-#endif
-
     __host__ __device__ static index_t GetOffsetFromMultiIndex(Array<index_t, nDim> multi_id)
     {
         const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);

@@ -126,38 +118,6 @@ struct ConstantMergedTensorDescriptor
         return dummy_desc.GetMultiIndexFrom1dIndex(id);
     }

-#if 0 // not needed
-    template <index_t IDim>
-    __host__ __device__ static index_t GetNewOriginalMultiIndexAfterMovingAlongOneDimension(
-        Array<index_t, nOriginalDim> old_original_multi_id, Number<IDim>, index_t step_size)
-    {
-        auto new_original_multi_id = old_original_multi_id;
-
-        // get partial-original-multi-id corresponding to this merged dimension
-        constexpr auto original_partial_dims = std::get<IDim>(mOriginalDimMergeSeqs);
-
-        constexpr auto original_partial_tensor_desc =
-            OriginalTensorDesc::Extract(original_partial_dims);
-
-        auto old_original_partial_multi_id =
-            extract_array(old_original_mutli_id, original_paritial_dims);
-
-        auto new_original_partial_multi_id =
-            original_partial_tensor_desc.GetNewMultiIndexGivenStepSizeOf1dIndex(
-                old_original_partial_multi_id, step_size);
-
-        // update original-mutli-id
-        static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) {
-            constexpr auto I = decltype(I_){};
-            constexpr index_t idim_original = original_dims_partial.Get(I);
-            new_original_multi_id[idim_original] = original_multi_id_partial[I.Get()];
-        });
-
-        return new_original_multi_id;
-    }
-#endif
 };

 template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
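Note (editor's toy example; the names and sizes here are invented): the function kept above, GetOffsetFromMultiIndex, first expands each merged index back into the original dimensions it covers, then applies the original strides. In miniature, for a [C, H, W] tensor whose H and W are merged into one dimension B = H*W:

    // Split a merged index back into (h, w), then use the original strides.
    #include <cstdio>

    int main()
    {
        const unsigned H = 3, W = 4;
        const unsigned stride_c = H * W, stride_h = W, stride_w = 1;

        unsigned c = 1, b = 7; // merged index b in [0, H*W)

        unsigned h = b / W, w = b % W; // expand merged index

        unsigned offset = c * stride_c + h * stride_h + w * stride_w;
        printf("offset = %u\n", offset); // 12 + 4 + 3 = 19
    }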
src/include/ConstantTensorDescriptor.hip.hpp

@@ -4,7 +4,7 @@
 template <class Lengths>
 __host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths)
 {
-    return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
+    return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), mod_conv::multiplies<index_t>{})
         .PushBack(Number<1>{});
 }

@@ -95,7 +95,7 @@ struct ConstantTensorDescriptor
     __host__ __device__ static constexpr index_t GetElementSize()
     {
-        return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
+        return accumulate_on_sequence(Lengths{}, mod_conv::multiplies<index_t>{}, Number<1>{});
     }

 // WRONG! ReorderGivenOld2New is broken

@@ -107,10 +107,10 @@ struct ConstantTensorDescriptor
         constexpr auto strides_in_rank = GetStrides().ReorderGivenOld2new(MemoryRank{});

         constexpr index_t element_space_unaligned = accumulate_on_sequence(
-            (lengths_in_rank - Number<1>{}) * strides_in_rank, std::plus<index_t>{}, Number<1>{});
+            (lengths_in_rank - Number<1>{}) * strides_in_rank, mod_conv::plus<index_t>{}, Number<1>{});
 #else
 // WRONG! align shouldbe applied to the last memory rank, not the last tensor dimension
         constexpr index_t element_space_unaligned = accumulate_on_sequence(
-            (GetLengths() - Number<1>{}) * GetStrides(), std::plus<index_t>{}, Number<1>{});
+            (GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus<index_t>{}, Number<1>{});
 #endif

         return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());

@@ -144,7 +144,8 @@ struct ConstantTensorDescriptor
         constexpr auto multi_id = Sequence<Is...>{};

-        return accumulate_on_sequence(multi_id * GetStrides(), std::plus<index_t>{}, Number<0>{});
+        return accumulate_on_sequence(
+            multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
     }

 #if 0 // ReorderGivenOld2new is broken

@@ -197,11 +198,18 @@ struct ConstantTensorDescriptor
     // This function doesn't do carry check on the highest dimension, for performance reason.
     // It is the user's responsibility to make sure the result "new_mutli_id" is not out-of-bound
     // on the highest dimension
+    template <bool PositiveDirection>
     __host__ __device__ static Array<index_t, nDim>
     UpdateMultiIndexGivenStepSizeOf1dIndex(Array<index_t, nDim> old_multi_id,
-                                           index_t step_size_of_1d_index)
+                                           index_t step_size_of_1d_index,
+                                           integral_constant<bool, PositiveDirection>)
     {
-        auto new_multi_id = old_multi_id + GetMultiIndexFrom1dIndex(step_size_of_1d_index);
+        Array<index_t, nDim> new_multi_id;
+
+        const auto step_sizes = GetMultiIndexFrom1dIndex(step_size_of_1d_index);
+
+        static_if<PositiveDirection>{}([&](auto) {
+            new_multi_id = old_multi_id + step_sizes;

             bool carry = false;

@@ -224,6 +232,37 @@ struct ConstantTensorDescriptor
                 carry = true;
             }
         });
+        }).Else([&](auto) {
+            // shift up multi-id to avoid unsigned integer underflow during intermediate
+            // calculations. After the shift, should have new_multi_id[...] >= 1
+            new_multi_id = old_multi_id + (GetLengths() - step_sizes);
+
+            bool borrow = false;
+
+            // do borrow check in reversed order, starting from lowest dimension
+            // don't check the highest dimension
+            static_for<0, nDim - 1, 1>{}([&](auto IDimReverse) {
+                constexpr index_t idim = nDim - 1 - IDimReverse.Get();
+                constexpr auto IDim    = Number<idim>{};
+
+                if(borrow)
+                {
+                    --new_multi_id[idim];
+                }
+
+                borrow = false;
+
+                if(new_multi_id[idim] < GetLength(IDim))
+                {
+                    new_multi_id[idim] += GetLength(IDim);
+                    borrow = true;
+                }
+            });
+
+            // shift back down multi-id
+            // here, should have new_multi_id[...] >= GetLengths()
+            new_multi_id = new_multi_id - GetLengths();
+        });

         return new_multi_id;
     }

@@ -255,7 +294,7 @@ struct ConstantTensorDescriptor
     }

     template <class... Ts>
-    __host__ __device__ static constexpr auto Inject(ConstantTensorDescriptor<Ts...>)
+    __host__ __device__ static constexpr auto Embed(ConstantTensorDescriptor<Ts...>)
     {
         using leaf_tensor = ConstantTensorDescriptor<Ts...>;

@@ -290,7 +329,7 @@ struct ConstantTensorDescriptor
         constexpr auto fold_intervals = Sequence<FoldIntervals...>{};

         constexpr index_t fold_intervals_product =
-            accumulate_on_sequence(fold_intervals, std::multiplies<index_t>{}, Number<1>{});
+            accumulate_on_sequence(fold_intervals, mod_conv::multiplies<index_t>{}, Number<1>{});

         constexpr auto unfold_length = GetLength(Number<IDim>{});
         constexpr auto unfold_stride = GetStride(Number<IDim>{});

@@ -309,7 +348,7 @@ struct ConstantTensorDescriptor
         constexpr auto fold_strides =
             Number<unfold_stride>{} *
             reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}),
-                                            std::multiplies<index_t>{});
+                                            mod_conv::multiplies<index_t>{});

         // folded_ranks
         constexpr auto fold_ranks =

@@ -389,7 +428,7 @@ struct ConstantTensorDescriptor
         // unfolded length, stride and rank
         constexpr index_t unfold_length = accumulate_on_sequence(
-            GetLengths().Extract(middle), std::multiplies<index_t>{}, Number<1>{});
+            GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});

         constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});

@@ -472,7 +511,20 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
 {
     constexpr index_t ndim = TDesc::GetNumOfDimension();

-    static_assert(ndim >= 2 && ndim <= 10, "wrong!");
+    static_assert(ndim >= 1 && ndim <= 10, "wrong!");
+
+    static_if<ndim == 1>{}([&](auto fwd) {
+        constexpr auto I0 = Number<0>{};
+
+        constexpr auto desc = fwd(TDesc{});
+
+        printf("%s dim %u, lengths {%u}, strides {%u}, ranks {%u}\n",
+               s,
+               desc.GetNumOfDimension(),
+               desc.GetLength(I0),
+               desc.GetStride(I0),
+               desc.GetMemoryRank(I0));
+    });

     static_if<ndim == 2>{}([&](auto fwd) {
         constexpr auto I0 = Number<0>{};
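Note (editor's standalone model, not the kernel code; the borrow-flag mechanics are simplified into an immediate decrement, which has the same effect): the new Else branch above steps a multi-index backwards using only unsigned arithmetic. It first adds (lengths - step) so every digit stays non-negative, runs a borrow pass from the lowest dimension upward (skipping the highest), then subtracts the lengths again. A plain-C++ rendering with made-up lengths:

    #include <cstdio>

    int main()
    {
        const unsigned nDim = 3;
        unsigned lengths[3] = {4, 3, 5};
        unsigned old_id[3]  = {2, 0, 1};
        unsigned steps[3]   = {0, 1, 2}; // multi-index form of the 1d step

        unsigned new_id[3];
        for(unsigned i = 0; i < nDim; ++i)
            new_id[i] = old_id[i] + (lengths[i] - steps[i]); // shift up

        // borrow pass from lowest dimension, not checking the highest
        for(unsigned i = nDim - 1; i > 0; --i)
        {
            if(new_id[i] < lengths[i])
            {
                new_id[i] += lengths[i];
                --new_id[i - 1]; // borrow from the next-higher dimension
            }
        }

        for(unsigned i = 0; i < nDim; ++i)
            new_id[i] -= lengths[i]; // shift back down

        printf("%u %u %u\n", new_id[0], new_id[1], new_id[2]); // 1 1 4
    }

Sanity check: in the mixed radix (4, 3, 5), (2, 0, 1) is 31 and the step (0, 1, 2) is 7; 31 - 7 = 24, which is indeed (1, 1, 4).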
src/include/Sequence.hip.hpp

@@ -495,3 +495,39 @@ __host__ __device__ constexpr auto Sequence<Is...>::Modify(Number<I>, Number<X>)
     return seq_left.PushBack(Number<X>{}).Append(seq_right);
 }

+template <index_t... Xs>
+__host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
+{
+    constexpr index_t nsize = Sequence<Xs...>::GetSize();
+
+    static_assert(nsize <= 10, "wrong!");
+
+    static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 3>{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 4>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 5>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 6>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 7>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 8>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 9>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 10>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+}
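Note (editor's sketch): print_Sequence works by expanding the compile-time integer pack Xs... straight into printf's varargs, selecting a format string with the matching number of "%u"s at compile time. A reduced standalone version for one fixed pack size:

    #include <cstdio>

    template <unsigned... Xs>
    void print_pack_of_3(const char* s)
    {
        static_assert(sizeof...(Xs) == 3, "format string below expects 3 values");
        printf("%s {%u %u %u}\n", s, Xs...); // pack expands into the varargs
    }

    int main() { print_pack_of_3<1, 8, 64>("lengths"); }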
src/include/blockwise_3d_tensor_op.hip.hpp

@@ -158,7 +158,7 @@ struct Blockwise3dTensorCopy3
                       "wrrong! BlockSize is not big enough for ThreadPerDims!");

         constexpr index_t num_active_thread =
-            accumulate_on_sequence(ThreadPerDims{}, std::multiplies<index_t>{}, Number<1>{});
+            accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});

         if(BlockSize > num_active_thread)
         {
src/include/blockwise_4d_tensor_op.hip.hpp

@@ -500,7 +500,7 @@ struct Blockwise4dTensorCopy3
                       "wrrong! BlockSize is not big enough for ThreadPerDims!");

         constexpr index_t num_active_thread =
-            accumulate_on_sequence(ThreadPerDims{}, std::multiplies<index_t>{}, Number<1>{});
+            accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});

         if(BlockSize > num_active_thread)
         {
src/include/blockwise_generic_tensor_slice_op.hip.hpp

@@ -3,6 +3,7 @@
 // slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
 // memory layout (ordering of dimensions) can be different between src and dst
+// For now, only support SubLengths == 1 on a merged dimension
 template <index_t BlockSize,
           class Float,
           class SrcDesc,

@@ -47,7 +48,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
     BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_multi_id_begin,
                                        Array<index_t, nDim> dst_block_data_multi_id_begin)
     {
-        // check NDim consistent
+        // check NDim consistency
         static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                           nDim == DstDesc::GetNumOfDimension() &&
                           nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
                           nDim == DataClusterLengths::GetSize() &&

@@ -55,7 +56,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                           nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(),
                       "wrong");

-        // check
+        // check thread arrange order and read/write access order are valid
         static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
                           is_valid_sequence_map<SrcAccessOrder>::value &&
                           is_valid_sequence_map<DstAccessOrder>::value,

@@ -140,10 +141,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
         });

         // complete offset
-        mThreadSrcOffset = reduce_on_array(mThreadSrcPartialOffsets, std::plus<index_t>{});
-        mThreadDstOffset = reduce_on_array(mThreadDstPartialOffsets, std::plus<index_t>{});
+        mThreadSrcOffset = accumulate_on_array(
+            mThreadSrcPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));
+
+        mThreadDstOffset = accumulate_on_array(
+            mThreadDstPartialOffsets, mod_conv::plus<index_t>{}, static_cast<index_t>(0));

 #if 0
         if(get_block_1d_id() == 0)
         {
             printf("id %5u %5u: "
                    "src_block_data_multi_id_begin: %u %u %u %u, "

@@ -279,13 +284,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
     // the boundary of the tensor being sliced. This functions doesn't do runtime sanity
     // check on out-of-bound slicing window, for performance reason
     template <index_t IDim_, index_t StepSize, bool PositiveDirection>
     __device__ void MoveSlicingWindowOnSourceTensor(
-        Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection>)
+        Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
     {
-        static_assert(PositiveDirection,
-                      "wrong! only support movement in positive direction for now");
-
         constexpr auto IDim = Number<IDim_>{};
         constexpr index_t idim = IDim.Get();

@@ -306,7 +307,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             auto new_src_partial_original_multi_id =
                 src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
-                    old_src_partial_original_multi_id, StepSize);
+                    old_src_partial_original_multi_id, StepSize, direction);

             // update "mThreadSrcOriginalMultiId"
             static_for<0, src_partial_original_dims.GetSize(), 1>{}([&](auto I_) {

@@ -328,7 +329,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             mThreadSrcPartialOffsets[idim] = new_src_partial_offset;

             // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
-            mThreadSrcOffset = mThreadSrcOffset + new_src_partial_offset - old_src_partial_offset;
+            mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
         }).Else([&](auto fwd) {
             // Logic for non-merged dimension. If you are never going to move the slicing window on
             // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",

@@ -336,13 +337,25 @@ struct BlockwiseGenericTensorSliceCopy_v1
             // should be able to remove these calculations.
             // TODO: make sure compiler would actually remove them in this case.

+            // It is the user's responsiblity to make sure the slicing window will not be moved out
+            // of the boundary of the tensor being sliced. Otherwise, there might be hazard like
+            // unsigned integer underflow. That is NO runtime sanity check to prevent the hazard
             constexpr index_t idim_original = SrcDesc::GetContainedOriginalDimensions(IDim).Front();

-            mThreadSrcOffset += StepSize * SrcDesc::GetStride(IDim);
-            mThreadSrcOriginalMultiId[idim_original] += StepSize;
-            mThreadSrcPartialOffsets[idim] += StepSize * SrcDesc::GetStride(IDim);
+            static_if<PositiveDirection>{}([&](auto) {
+                mThreadSrcOffset += StepSize * SrcDesc::GetStride(IDim);
+
+                mThreadSrcOriginalMultiId[idim_original] += StepSize;
+
+                mThreadSrcPartialOffsets[idim] += StepSize * SrcDesc::GetStride(IDim);
+            }).Else([&](auto) {
+                mThreadSrcOffset -= StepSize * SrcDesc::GetStride(IDim);
+
+                mThreadSrcOriginalMultiId[idim_original] -= StepSize;
+
+                mThreadSrcPartialOffsets[idim] -= StepSize * SrcDesc::GetStride(IDim);
+            });
         });
     }
 };
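Note (editor's illustration with invented numbers): mThreadSrcOffset is maintained as the sum of per-dimension partial offsets, so moving the slicing window along one dimension only replaces that dimension's term. The parenthesization change above spells out the "+ before -" ordering that keeps unsigned intermediates from underflowing. In miniature:

    #include <cstdio>

    int main()
    {
        unsigned partial[4] = {100, 40, 6, 1};
        unsigned total = partial[0] + partial[1] + partial[2] + partial[3]; // 147

        // move along dimension 2: the new partial offset may be smaller
        unsigned idim = 2, new_partial = 2, old_partial = partial[idim];
        partial[idim] = new_partial;
        total = (total + new_partial) - old_partial; // add first, then subtract

        printf("total = %u\n", total); // 143
    }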
src/include/common.hip.hpp

@@ -39,6 +39,18 @@ struct scales
     __host__ __device__ constexpr T operator()(T a) const { return s * a; }
 };

+template <class T>
+struct plus
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
+};
+
+template <class T>
+struct multiplies
+{
+    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
+};
+
 template <class T>
 struct integer_divide_ceiler
 {
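Note (editor's inference, not stated in the commit): the plus and multiplies functors added here mirror std::plus and std::multiplies, and the call sites across this commit switch from the std:: versions to the mod_conv:: ones. A plausible reason is that the standard functors' call operators are not __host__ __device__-qualified, so device code cannot reliably invoke them; hand-rolled functors avoid that. A stand-alone sketch of the pattern, compiled as plain C++:

    #include <cstdio>

    // stand-in for __host__ __device__ when compiled outside HIP (assumption)
    #define HOST_DEVICE

    template <class T>
    struct plus
    {
        HOST_DEVICE constexpr T operator()(T a, T b) const { return a + b; }
    };

    int main() { printf("%d\n", plus<int>{}(2, 3)); } // 5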
src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp

@@ -58,6 +58,9 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
         constexpr auto I6 = Number<6>{};
         constexpr auto I7 = Number<7>{};

+        constexpr auto True  = integral_constant<bool, true>{};
+        constexpr auto False = integral_constant<bool, false>{};
+
         constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
         constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{};
         constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};

@@ -123,7 +126,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
         // input blockwise copy
         // slice a merged tensor, reorder and copy to a normal tensor
         // this copy operator already has blockwise offset built-in
-        const auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
+        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
             BlockSize,
             Float,
             decltype(in_c_n1_b_n2_global_merged_desc),

@@ -150,8 +153,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
         // operator for blockwise copy of weight into LDS
         // slice a tensor, and copy it into another tensor
         // this copy operator already have blockwise offset built-in
-        const auto blockwise_wei_copy =
-#if 0
+        auto blockwise_wei_copy =
+#if 1
             BlockwiseGenericTensorSliceCopy_v1<BlockSize,
                                                Float,
                                                decltype(wei_c_k_global_desc),

@@ -181,10 +184,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
         // b_mtx[CPerBlocl, N1 * BPerBlock * N2] is in LDS
         // c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in
         // register
-        constexpr auto a_c_k_block_mtx_desc =
-            make_ConstantMatrixDescriptor(Number<CPerBlock>{},
-                                          Number<KPerBlock>{},
-                                          Number<wei_c_k_block_desc.GetStride(I0)>{});
+        constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor(
+            Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_c_k_block_desc.GetStride(I0)>{});

         constexpr auto b_c_n1bn2_block_mtx_desc =
             make_ConstantMatrixDescriptor(Number<CPerBlock>{},

@@ -239,6 +240,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
         // zero out threadwise output
         threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread);

+#if 0
         // do work
         for(index_t y = 0; y < Y; ++y)
         {

@@ -269,6 +271,45 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
                 }
             }
         }
+#else
+        for(index_t y = 0; y < Y; ++y)
+        {
+            for(index_t x = 0; x < X; ++x)
+            {
+                // calculate origin of block input and weight tensor on global memory
+                const Float* p_in_block_on_global =
+                    p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x);
+
+                const Float* p_wei_block_on_global =
+                    p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0);
+
+                for(index_t c_block_data_on_global = 0; c_block_data_on_global < C;
+                    c_block_data_on_global += CPerBlock)
+                {
+                    blockwise_in_copy.Run(p_in_block_on_global, p_in_block);
+                    blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block);
+
+                    __syncthreads();
+
+                    blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread);
+
+                    __syncthreads();
+
+                    // move on C: C_N1_B_N2, C_K
+                    blockwise_in_copy.MoveSlicingWindowOnSourceTensor(
+                        I0, Number<CPerBlock>{}, True);
+                    blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(
+                        I0, Number<CPerBlock>{}, True);
+                }
+
+                // reset C
+                blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
+                blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<C>{}, False);
+            }
+        }
+#endif

         // copy output: register to global memory
         {
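Note (editor's schematic, not the kernel): the new #else loop above iterates filter taps (y, x) and, for each tap, marches the slicing window forward over C in CPerBlock steps, then undoes the whole C traversal in one negative-direction move (the new False argument), so the next tap starts from C = 0. A host-side model of just the window bookkeeping:

    #include <cstdio>

    int main()
    {
        const int Y = 3, X = 3, C = 8, CPerBlock = 4;
        int window = 0; // window position along C

        for(int y = 0; y < Y; ++y)
            for(int x = 0; x < X; ++x)
            {
                for(int c = 0; c < C; c += CPerBlock)
                {
                    // load / gemm at (y, x, window) would happen here
                    window += CPerBlock; // move on C (positive direction)
                }
                window -= C; // reset C (negative direction)
            }

        printf("window = %d\n", window); // back at 0
    }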
src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp

@@ -59,7 +59,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
         constexpr auto I6 = Number<6>{};
         constexpr auto I7 = Number<7>{};

-        constexpr auto TRUE = integral_constant<bool, true>{};
+        constexpr auto True = integral_constant<bool, true>{};

         constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
         constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};

@@ -102,9 +102,9 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
         const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;

         // input tensor
-        // tensor descriptor in device memory [N0, N1, N2, H, W]
-        constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Hi>{})
-                                                         .Slice(I3, Number<Wi>{})
+        // tensor descriptor in device memory [N0, N1, N2, Ho, Wo]
+        constexpr auto in_n0_n1_n2_h_w_global_desc = in_n_c_h_w_global_desc.Slice(I2, Number<Ho>{})
+                                                         .Slice(I3, Number<Wo>{})
                                                          .Fold(I0, Number<N1>{}, Number<N2>{})
                                                          .Extract(Sequence<0, 1, 2, 4, 5>{});

@@ -115,12 +115,23 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
         // merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy
         constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
-            in_c_y_x_global_desc.Inject(in_n0_n1_n2_h_w_global_desc),
+            in_c_y_x_global_desc.Embed(in_n0_n1_n2_h_w_global_desc),
             Sequence<0, 1, 2>{},
             Sequence<4>{},
             Sequence<3, 6, 7>{},
             Sequence<5>{});

+#if 0
+        if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
+        {
+            print_ConstantTensorDescriptor(in_n0_n1_n2_h_w_global_desc,
+                                           "in_n0_n1_n2_h_w_global_desc: ");
+            print_ConstantTensorDescriptor(in_c_y_x_global_desc, "in_c_y_x_global_desc: ");
+
+            print_ConstantMergedTensorDescriptor(in_e_n1_b_n2_global_merged_desc,
+                                                 "in_e_n1_b_n2_global_merged_desc: ");
+        }
+#endif
+
         // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
         // be careful of LDS alignment
         constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(

@@ -243,6 +254,31 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
         // do work
         for(index_t e = 0; e < E; e += EPerBlock)
         {
+#if 0
+            if(e == 1 * EPerBlock && get_block_1d_id() == 0)
+            {
+                printf("id %5u %5u: "
+                       "mThreadSrcOriginalMultiId %u %u %u %u %u %u %u %u, "
+                       "mThreadSrcPartialOffsets %u %u %u %u, "
+                       "mThreadSrcOffset %u, mThreadDstOffset %u \n",
+                       get_block_1d_id(),
+                       get_thread_local_1d_id(),
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[0],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[1],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[2],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[3],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[4],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[5],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[6],
+                       blockwise_in_copy.mThreadSrcOriginalMultiId[7],
+                       blockwise_in_copy.mThreadSrcPartialOffsets[0],
+                       blockwise_in_copy.mThreadSrcPartialOffsets[1],
+                       blockwise_in_copy.mThreadSrcPartialOffsets[2],
+                       blockwise_in_copy.mThreadSrcPartialOffsets[3],
+                       blockwise_in_copy.mThreadSrcOffset,
+                       blockwise_in_copy.mThreadDstOffset);
+            }
+#endif
+
             // marching slicing window
             blockwise_in_copy.Run(p_in_global, p_in_block);
             blockwise_wei_copy.Run(p_wei_global, p_wei_block);

@@ -253,8 +289,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
             __syncthreads();

-            blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, TRUE);
-            blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, TRUE);
+            blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
+            blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
         }

         // copy output: register to global memory
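Note (editor's host-side reference with invented sizes; stride 1 and no padding assumed, and the kernel's N1/B/N2 decomposition is not modeled): in the v4 kernel the merged dimension E = C*Y*X is the GEMM reduction axis, and the output pixels form the other axis. For each output (n, k, ho, wo), input element (n, c, ho + y, wo + x) meets weight element (k, c, y, x). A naive CPU rendering of that index mapping:

    #include <cstdio>
    #include <vector>

    int main()
    {
        const int N = 1, C = 2, HI = 4, WI = 4, K = 1, Y = 3, X = 3;
        const int HO = HI - Y + 1, WO = WI - X + 1;

        std::vector<float> in(N * C * HI * WI, 1.0f), wei(K * C * Y * X, 1.0f);
        std::vector<float> out(N * K * HO * WO, 0.0f);

        for(int n = 0; n < N; ++n)
            for(int k = 0; k < K; ++k)
                for(int ho = 0; ho < HO; ++ho)
                    for(int wo = 0; wo < WO; ++wo)
                        for(int e = 0; e < C * Y * X; ++e) // E = C*Y*X, the reduction axis
                        {
                            int c = e / (Y * X), y = (e % (Y * X)) / X, x = e % X;
                            out[((n * K + k) * HO + ho) * WO + wo] +=
                                wei[((k * C + c) * Y + y) * X + x] *
                                in[((n * C + c) * HI + ho + y) * WI + wo + x];
                        }

        printf("%f\n", out[0]); // 18 = C*Y*X ones summed
    }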