gaoqiong / composable_kernel / Commits

Commit 7a89684f, authored Jun 06, 2019 by Chao Liu

refactor

parent eafdabba
Changes: 26 files. This page shows 20 changed files with 210 additions and 381 deletions (+210 −381); the remaining files are on page 2.
driver/driver.hip.cpp (+2 −2)
src/include/Array.hip.hpp (+31 −49)
src/include/ConstantMergedTensorDescriptor.hip.hpp (+26 −52)
src/include/ConstantTensorDescriptor.hip.hpp (+73 −124)
src/include/Sequence.hip.hpp (+20 −41)
src/include/blockwise_generic_tensor_slice_op.hip.hpp (+8 −8)
src/include/common.hip.hpp (+1 −1)
src/include/functional.hip.hpp (+1 −37)
src/include/functional2.hip.hpp (+28 −47)
src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp (+3 −3)
src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp (+1 −1)
src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp (+1 −1)
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp (+1 −1)
driver/driver.hip.cpp — view file @ 7a89684f

```diff
@@ -646,9 +646,9 @@ int main(int argc, char* argv[])
     device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
 #elif 0
     device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
-#elif 0
-    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
+#elif 1
+    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
 #elif 0
     device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw
 #endif
         (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
```
src/include/Array.hip.hpp — view file @ 7a89684f

```diff
@@ -18,9 +18,21 @@ struct Array
     __host__ __device__ constexpr index_t GetSize() const { return NSize; }

+    template <index_t I>
+    __host__ __device__ constexpr TData operator[](Number<I>) const
+    {
+        return mData[I];
+    }
+
     __host__ __device__ constexpr TData operator[](index_t i) const { return mData[i]; }

-    __host__ __device__ TData& operator[](index_t i) { return mData[i]; }
+    template <index_t I>
+    __host__ __device__ TData& operator()(Number<I>)
+    {
+        return mData[I];
+    }
+
+    __host__ __device__ TData& operator()(index_t i) { return mData[i]; }

     template <index_t I>
     __host__ __device__ constexpr TData Get(Number<I>) const
```
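The hunk above splits `Array`'s element access in two: `operator[]` stays `constexpr` and read-only, while writes move to a non-const `operator()`. The remaining hunks in this file are the mechanical fallout, rewriting `new_array[i] = ...` as `new_array(i) = ...`. A minimal standalone sketch of the idiom (plain C++14, not the project's actual `Array`):

```cpp
#include <cstddef>

using index_t = std::size_t; // assumption: the project's index_t is some integer type

template <class TData, index_t NSize>
struct MiniArray
{
    TData mData[NSize];

    // read access: usable in constant expressions
    constexpr TData operator[](index_t i) const { return mData[i]; }

    // write access: spelled operator() so reads and writes are visually distinct
    TData& operator()(index_t i) { return mData[i]; }
};

int main()
{
    MiniArray<int, 3> a{{1, 2, 3}};
    a(0) = 42;                     // write through operator()
    return a[0] == 42 ? 0 : 1;     // read through operator[]
}
```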
```diff
@@ -44,10 +56,10 @@ struct Array
         static_for<0, NSize, 1>{}([&](auto I) {
             constexpr index_t i = I.Get();
-            new_array[i] = mData[i];
+            new_array(i) = mData[i];
         });

-        new_array[NSize] = x;
+        new_array(NSize) = x;

         return new_array;
     }
```
```diff
@@ -62,20 +74,9 @@ __host__ __device__ constexpr auto sequence2array(Sequence<Is...>)
 template <class TData, index_t NSize>
 __host__ __device__ constexpr auto make_zero_array()
 {
-#if 0
-    Array<TData, NSize> a;
-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        a[i] = static_cast<TData>(0);
-    });
-    return a;
-#else
     constexpr auto zero_sequence = typename uniform_sequence_gen<NSize, 0>::SeqType{};
     constexpr auto zero_array    = sequence2array(zero_sequence);
     return zero_array;
-#endif
 }

 template <class TData, index_t NSize, index_t... IRs>
```
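`make_zero_array` now builds its result from a compile-time sequence of zeros instead of a loop over mutable state, which keeps the whole value usable in constant expressions. A hedged sketch of the same idea using only the standard library (`uniform_sequence_gen` and `sequence2array` are the project's helpers; the names below are stand-ins):

```cpp
#include <array>
#include <cstddef>
#include <utility>

template <class TData, std::size_t... Is>
constexpr std::array<TData, sizeof...(Is)> sequence_to_array(std::index_sequence<Is...>)
{
    // expand the pack into a braced initializer; (Is, TData(0)) yields 0 per element
    return {{(static_cast<void>(Is), TData(0))...}};
}

template <class TData, std::size_t NSize>
constexpr std::array<TData, NSize> make_zero_array()
{
    return sequence_to_array<TData>(std::make_index_sequence<NSize>{});
}

static_assert(make_zero_array<int, 4>()[3] == 0, "zero array is usable at compile time");
```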
```diff
@@ -94,44 +95,26 @@ __host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData, NSize>&, ...)
     return new_array;
 }

-#if 0
-template <class TData, index_t NSize, index_t... IRs>
-__host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>& old_array,
-                                                               Sequence<IRs...> old2new)
-{
-    Array<TData, NSize> new_array;
-
-    static_assert(NSize == sizeof...(IRs), "NSize not consistent");
-
-    static_for<0, NSize, 1>{}([&](auto IDim) {
-        constexpr index_t idim = IDim.Get();
-        new_array[old2new.Get(IDim)] = old_array[idim];
-    });
-
-    return new_array;
-}
-#else
 template <class TData, index_t NSize, class MapOld2New>
-struct reorder_array_given_old2new_impl
+struct lambda_reorder_array_given_old2new
 {
-    const Array<TData, NSize>& old_array_ref;
-    Array<TData, NSize>& new_array_ref;
+    const Array<TData, NSize>& old_array;
+    Array<TData, NSize>& new_array;

-    __host__ __device__ constexpr reorder_array_given_old2new_impl(
-        const Array<TData, NSize>& old_array, Array<TData, NSize>& new_array)
-        : old_array_ref(old_array), new_array_ref(new_array)
+    __host__ __device__ constexpr lambda_reorder_array_given_old2new(
+        const Array<TData, NSize>& old_array_, Array<TData, NSize>& new_array_)
+        : old_array(old_array_), new_array(new_array_)
     {
     }

     template <index_t IOldDim>
     __host__ __device__ constexpr void operator()(Number<IOldDim>) const
     {
-        TData old_data = old_array_ref.Get(Number<IOldDim>{});
+        TData old_data = old_array[IOldDim];

         constexpr index_t INewDim = MapOld2New::Get(Number<IOldDim>{});

-        new_array_ref.Set(Number<INewDim>{}, old_data);
+        new_array.Set(Number<INewDim>{}, old_data);
     }
 };
```
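The `*_impl` → `lambda_*` renames above follow the pattern this whole commit leans on: C++14 forbids `constexpr` lambdas, so every capturing lambda passed to `static_for` is spelled out as a functor struct whose captures become reference members and whose `operator()` is `constexpr`. A reduced, self-contained sketch of the pattern (hypothetical names, plain C++14):

```cpp
using index_t = int; // assumption

template <index_t I>
struct Number { static constexpr index_t value = I; };

// hand-written stand-in for "[&](auto I){ dst[I] = src[I]; }"
struct lambda_copy_element
{
    const index_t (&src)[4]; // "captures", held as reference members
    index_t (&dst)[4];

    constexpr lambda_copy_element(const index_t (&src_)[4], index_t (&dst_)[4])
        : src(src_), dst(dst_)
    {
    }

    template <index_t I>
    constexpr void operator()(Number<I>) const // constexpr, unlike a C++14 lambda
    {
        dst[I] = src[I];
    }
};

int main()
{
    const index_t src[4] = {1, 2, 3, 4};
    index_t dst[4]       = {};
    lambda_copy_element f(src, dst);
    f(Number<0>{}); f(Number<1>{}); f(Number<2>{}); f(Number<3>{});
    return dst[3] == 4 ? 0 : 1;
}
```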
```diff
@@ -144,11 +127,10 @@ __host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>&, ...)
     static_assert(NSize == sizeof...(IRs), "NSize not consistent");

-    static_for<0, NSize, 1>{}(reorder_array_given_old2new_impl<TData, NSize, Sequence<IRs...>>(
-        old_array, new_array));
+    static_for<0, NSize, 1>{}(lambda_reorder_array_given_old2new<TData, NSize, Sequence<IRs...>>(
+        old_array, new_array));

     return new_array;
 }
-#endif

 template <class TData, index_t NSize, class ExtractSeq>
 __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_array, ExtractSeq)
```
```diff
@@ -161,7 +143,7 @@ __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_array, ExtractSeq)
     static_for<0, new_size, 1>{}([&](auto I) {
         constexpr index_t i = I.Get();
-        new_array[i] = old_array[ExtractSeq::Get(I)];
+        new_array(i) = old_array[ExtractSeq::Get(I)];
     });

     return new_array;
```
```diff
@@ -176,7 +158,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
     static_for<0, NSize, 1>{}([&](auto I) {
         constexpr index_t i = I.Get();
-        result[i] = a[i] + b[i];
+        result(i) = a[i] + b[i];
     });

     return result;
```
```diff
@@ -191,7 +173,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData, NSize> b)
     static_for<0, NSize, 1>{}([&](auto I) {
         constexpr index_t i = I.Get();
-        result[i] = a[i] - b[i];
+        result(i) = a[i] - b[i];
     });

     return result;
```
```diff
@@ -208,7 +190,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
     static_for<0, NSize, 1>{}([&](auto I) {
         constexpr index_t i = I.Get();
-        result[i] = a[i] + b.Get(I);
+        result(i) = a[i] + b.Get(I);
     });

     return result;
```
```diff
@@ -225,7 +207,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...> b)
     static_for<0, NSize, 1>{}([&](auto I) {
         constexpr index_t i = I.Get();
-        result[i] = a[i] - b.Get(I);
+        result(i) = a[i] - b.Get(I);
     });

     return result;
```
```diff
@@ -242,7 +224,7 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
     static_for<0, NSize, 1>{}([&](auto I) {
         constexpr index_t i = I.Get();
-        result[i] = a[i] * b.Get(I);
+        result(i) = a[i] * b.Get(I);
     });

     return result;
```
```diff
@@ -259,7 +241,7 @@ __host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSize> b)
     static_for<0, NSize, 1>{}([&](auto I) {
         constexpr index_t i = I.Get();
-        result[i] = a.Get(I) - b[i];
+        result(i) = a.Get(I) - b[i];
     });

     return result;
```
src/include/ConstantMergedTensorDescriptor.hip.hpp — view file @ 7a89684f

```diff
@@ -9,6 +9,8 @@
 template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
 struct ConstantMergedTensorDescriptor
 {
+    using Type = ConstantMergedTensorDescriptor;
+
     static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};

     static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs);
```
```diff
@@ -74,43 +76,17 @@ struct ConstantMergedTensorDescriptor
         return OriginalTensorDesc::GetElementSize();
     }

-#if 0
-    __host__ __device__ static constexpr auto
-    GetOriginalMultiIndexFromMultiIndex(Array<index_t, nDim> multi_id)
-    {
-        Array<index_t, nOriginalDim> original_multi_id;
-
-        static_for<0, nDim, 1>{}([&](auto IDim) {
-            constexpr index_t idim = IDim.Get();
-
-            constexpr auto original_dims_partial = std::get<idim>(mOriginalDimMergeSeqs);
-
-            // get partial original-multi-id corresponding to this merged dimension
-            const auto original_multi_id_partial =
-                OriginalTensorDesc::Extract(original_dims_partial)
-                    .GetMultiIndexFrom1dIndex(multi_id[idim]);
-
-            static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) {
-                constexpr auto I = decltype(I_){};
-                constexpr index_t idim_original = original_dims_partial.Get(I);
-                original_multi_id[idim_original] = original_multi_id_partial[I.Get()];
-            });
-        });
-
-        return original_multi_id;
-    }
-#else
     template <class OriginalDimsPartial>
-    struct GetOriginalMultiIndexFromMultiIndex_impl1
+    struct lambda_1_GetOriginalMultiIndexFromMultiIndex
     {
-        const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial_ref;
-        Array<index_t, nOriginalDim>& original_multi_id_ref;
+        const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial;
+        Array<index_t, nOriginalDim>& original_multi_id;

-        __host__ __device__ constexpr GetOriginalMultiIndexFromMultiIndex_impl1(
-            const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial,
-            Array<index_t, nOriginalDim>& original_multi_id)
-            : original_multi_id_partial_ref(original_multi_id_partial),
-              original_multi_id_ref(original_multi_id)
+        __host__ __device__ constexpr lambda_1_GetOriginalMultiIndexFromMultiIndex(
+            const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial_,
+            Array<index_t, nOriginalDim>& original_multi_id_)
+            : original_multi_id_partial(original_multi_id_partial_),
+              original_multi_id(original_multi_id_)
         {
         }
```
```diff
@@ -119,37 +95,36 @@ struct ConstantMergedTensorDescriptor
         {
             constexpr index_t idim_original = OriginalDimsPartial::Get(Number<I>{});

-            index_t itmp = original_multi_id_partial_ref.Get(Number<I>{});
+            index_t itmp = original_multi_id_partial[I];

-            original_multi_id_ref.Set(Number<idim_original>{}, itmp);
+            original_multi_id.Set(Number<idim_original>{}, itmp);
         }
     };

-    struct GetOriginalMultiIndexFromMultiIndex_impl0
+    struct lambda_0_GetOriginalMultiIndexFromMultiIndex
     {
-        const Array<index_t, nDim>& multi_id_ref;
-        Array<index_t, nOriginalDim>& original_multi_id_ref;
+        const Array<index_t, nDim>& multi_id;
+        Array<index_t, nOriginalDim>& original_multi_id;

-        __host__ __device__ constexpr GetOriginalMultiIndexFromMultiIndex_impl0(
-            const Array<index_t, nDim>& multi_id,
-            Array<index_t, nOriginalDim>& original_multi_id)
-            : multi_id_ref(multi_id), original_multi_id_ref(original_multi_id)
+        __host__ __device__ constexpr lambda_0_GetOriginalMultiIndexFromMultiIndex(
+            const Array<index_t, nDim>& multi_id_,
+            Array<index_t, nOriginalDim>& original_multi_id_)
+            : multi_id(multi_id_), original_multi_id(original_multi_id_)
         {
         }

         template <index_t IDim>
         __host__ __device__ constexpr void operator()(Number<IDim>) const
         {
-            constexpr auto original_dims_partial =
-                std::get<IDim>(std::tuple<OriginalDimMergeSeqs...>{});
+            constexpr auto original_dims_partial = std::get<IDim>(Type::mOriginalDimMergeSeqs);

             // get partial original-multi-id corresponding to this merged dimension
             const auto original_multi_id_partial =
                 OriginalTensorDesc::Extract(original_dims_partial)
-                    .GetMultiIndexFrom1dIndex(multi_id_ref[IDim]);
+                    .GetMultiIndexFrom1dIndex(multi_id[IDim]);

             static_for<0, original_dims_partial.GetSize(), 1>{}(
-                GetOriginalMultiIndexFromMultiIndex_impl1<decltype(original_dims_partial)>(
-                    original_multi_id_partial, original_multi_id_ref));
+                lambda_1_GetOriginalMultiIndexFromMultiIndex<decltype(original_dims_partial)>(
+                    original_multi_id_partial, original_multi_id));
         }
     };
```
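What these two functors compute together: each merged dimension's index is decomposed back into the original dimensions' indices against the packed strides of the extracted partial descriptor. A runtime illustration of that decomposition (the project does this at compile time over descriptor types; lengths and values below are made up):

```cpp
#include <cstdio>

int main()
{
    const int lengths[3] = {4, 3, 5}; // original dims merged into one dim of extent 60
    int strides[3];                   // packed strides: {15, 5, 1}
    strides[2] = 1;
    for(int d = 1; d >= 0; --d)
        strides[d] = strides[d + 1] * lengths[d + 1];

    int id = 37; // index into the merged dimension
    int multi[3];
    for(int d = 0; d < 2; ++d)
    {
        multi[d] = id / strides[d];
        id -= multi[d] * strides[d];
    }
    multi[2] = id;

    std::printf("%d %d %d\n", multi[0], multi[1], multi[2]); // 2 1 2 (2*15 + 1*5 + 2 = 37)
    return 0;
}
```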
```diff
@@ -160,7 +135,7 @@ struct ConstantMergedTensorDescriptor
         Array<index_t, nOriginalDim> original_multi_id;

-        static_for<0, nDim, 1>{}(
-            GetOriginalMultiIndexFromMultiIndex_impl0(multi_id, original_multi_id));
+        static_for<0, nDim, 1>{}(
+            lambda_0_GetOriginalMultiIndexFromMultiIndex(multi_id, original_multi_id));

         return original_multi_id;
     }
```
...
@@ -174,7 +149,6 @@ struct ConstantMergedTensorDescriptor
return
OriginalTensorDesc
::
GetOffsetFromMultiIndex
(
original_multi_id
);
}
#endif
__host__
__device__
static
constexpr
index_t
GetOffsetFromMultiIndex
(
Array
<
index_t
,
nDim
>
multi_id
)
...
...
```diff
@@ -192,9 +166,9 @@ struct ConstantMergedTensorDescriptor
     __host__ __device__ static constexpr Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
     {
-        constexpr auto dummy_desc = make_ConstantTensorDescriptor_packed(GetLengths());
+        constexpr auto packed_desc = make_ConstantTensorDescriptor_packed(GetLengths());

-        return dummy_desc.GetMultiIndexFrom1dIndex(id);
+        return packed_desc.GetMultiIndexFrom1dIndex(id);
     }
 };
```
src/include/ConstantTensorDescriptor.hip.hpp — view file @ 7a89684f

```diff
@@ -57,17 +57,38 @@ struct ConstantTensorDescriptor
         return Strides{}.Get(Number<I>{});
     }

-    __host__ __device__ static constexpr bool AreStridesNonAscending()
+    struct lambda_AreDimensionsContinuous
     {
-        bool flag = true;
+        bool& is_continuous;

-        static_for<0, nDim - 1, 1>{}([&](auto IDim) {
-            constexpr auto IDim_p1 = Number<IDim.Get() + 1>{};
+        __host__ __device__ constexpr lambda_AreDimensionsContinuous(bool& is_continuous_)
+            : is_continuous(is_continuous_)
+        {
+        }

-            flag = flag && (GetLength(IDim) >= GetLength(IDim_p1));
-        });
+        template <class X>
+        __host__ __device__ constexpr void operator()(X IDim) const
+        {
+            constexpr auto IDim_p1 = IDim + Number<1>{};
+
+            is_continuous =
+                is_continuous && (GetStride(IDim) >= GetStride(IDim_p1) &&
+                                  GetStride(IDim) == GetStride(IDim_p1) * GetLength(IDim_p1));
+        }
+    };

-        return flag;
+    __host__ __device__ static constexpr bool AreDimensionsContinuous()
+    {
+        bool is_continuous = true;
+
+        static_for<0, nDim - 1, 1>{}(lambda_AreDimensionsContinuous(is_continuous));
+
+        return is_continuous;
+    }
+
+    __host__ __device__ static constexpr bool IsPackedTensor()
+    {
+        return AreDimensionsContinuous() && GetStride(Number<nDim - 1>{}) == 1;
     }

     template <class T>
```
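The new `AreDimensionsContinuous` replaces the length-based `AreStridesNonAscending` check with the actual no-gap condition: `stride(d) == stride(d+1) * length(d+1)` for every adjacent pair, and `IsPackedTensor` additionally requires the innermost stride to be 1. A runtime sketch of the same test (illustrative values only):

```cpp
#include <cstdio>

bool are_dimensions_continuous(const int* lengths, const int* strides, int ndim)
{
    bool ok = true;
    for(int d = 0; d + 1 < ndim; ++d)
        ok = ok && strides[d] >= strides[d + 1] &&
             strides[d] == strides[d + 1] * lengths[d + 1];
    return ok;
}

int main()
{
    const int lengths[3] = {2, 3, 4};
    const int packed[3]  = {12, 4, 1}; // 4*1 = 4, 3*4 = 12 -> continuous (and packed)
    const int padded[3]  = {16, 4, 1}; // row padded from 12 to 16 -> gap between rows
    std::printf("%d %d\n",
                are_dimensions_continuous(lengths, packed, 3),
                are_dimensions_continuous(lengths, padded, 3)); // 1 0
    return 0;
}
```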
```diff
@@ -92,40 +113,24 @@ struct ConstantTensorDescriptor
         return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
     }

-#if 0
-    template <index_t NSize>
-    __host__ __device__ static constexpr index_t
-    GetOffsetFromMultiIndex(Array<index_t, NSize> multi_id)
-    {
-        static_assert(NSize == nDim, "wrong! Dimension not consistent");
-
-        index_t offset = 0;
-
-        static_for<0, nDim, 1>{}([&](auto IDim) {
-            constexpr index_t idim = IDim.Get();
-            offset += multi_id[idim] * GetStride(IDim);
-        });
-
-        return offset;
-    }
-#else
+    // emulate constexpr lambda
     template <index_t NSize>
-    struct GetOffsetFromMultiIndex_impl
+    struct lambda_GetOffsetFromMultiIndex
     {
-        Array<index_t, NSize>& multi_id_ref;
-        index_t& offset_ref;
+        Array<index_t, NSize>& multi_id;
+        index_t& offset;

-        __host__ __device__ constexpr GetOffsetFromMultiIndex_impl(
-            Array<index_t, NSize>& multi_id, index_t& offset)
-            : multi_id_ref(multi_id), offset_ref(offset)
+        __host__ __device__ constexpr lambda_GetOffsetFromMultiIndex(
+            Array<index_t, NSize>& multi_id_, index_t& offset_)
+            : multi_id(multi_id_), offset(offset_)
         {
         }

-        template <index_t IDim>
-        __host__ __device__ constexpr bool operator()(Number<IDim>) const
+        template <class X>
+        __host__ __device__ constexpr void operator()(X IDim) const
         {
-            offset_ref += multi_id_ref.Get(Number<IDim>{}) * Type::GetStride(Number<IDim>{});
-            return true;
+            offset += multi_id.Get(IDim) * Type::GetStride(IDim);
         }
     };
```
```diff
@@ -137,11 +142,10 @@ struct ConstantTensorDescriptor
         index_t offset = 0;

-        static_for<0, nDim, 1>{}(GetOffsetFromMultiIndex_impl<NSize>(multi_id, offset));
+        static_for<0, nDim, 1>{}(lambda_GetOffsetFromMultiIndex<NSize>(multi_id, offset));

         return offset;
     }
-#endif

     template <class... Is>
     __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Is... is)
```
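`lambda_GetOffsetFromMultiIndex` accumulates the usual strided dot product. Written as a plain loop for clarity (the project unrolls it with `static_for` over compile-time strides):

```cpp
// offset = sum over d of multi_id[d] * strides[d]
int offset_from_multi_index(const int* multi_id, const int* strides, int ndim)
{
    int offset = 0;
    for(int d = 0; d < ndim; ++d)
        offset += multi_id[d] * strides[d];
    return offset;
    // e.g. multi_id {2, 1, 2} with packed strides {15, 5, 1} gives 37
}
```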
```diff
@@ -160,47 +164,26 @@ struct ConstantTensorDescriptor
             multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
     }

-#if 0
-    __host__ __device__ static constexpr Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
-    {
-        Array<index_t, nDim> multi_id;
-
-        constexpr auto dummy_strides = calculate_tensor_strides_packed(GetLengths());
-
-        // calculate index in each of the dimensions in the order of their dimension
-        static_for<0, nDim - 1, 1>{}([&](auto IDim) {
-            constexpr index_t idim   = IDim.Get();
-            constexpr index_t stride = dummy_strides.Get(Number<idim>{});
-            multi_id[idim] = id / stride;
-            id -= multi_id[idim] * stride;
-        });
-
-        multi_id[nDim - 1] = id / dummy_strides.Get(Number<nDim - 1>{});
-
-        return multi_id;
-    }
-#else
-    struct GetMultiIndexFrom1dIndex_impl
+    // emulate constexpr lambda
+    template <class PackedStrides>
+    struct lambda_GetMultiIndexFrom1dIndex
     {
-        using DummyStrides = decltype(calculate_tensor_strides_packed(GetLengths()));
-
-        index_t& id_ref;
-        Array<index_t, nDim>& multi_id_ref;
+        index_t& id;
+        Array<index_t, nDim>& multi_id;

-        __host__ __device__ constexpr GetMultiIndexFrom1dIndex_impl(
-            index_t& id, Array<index_t, nDim>& multi_id)
-            : id_ref(id), multi_id_ref(multi_id)
+        __host__ __device__ constexpr lambda_GetMultiIndexFrom1dIndex(
+            index_t& id_, Array<index_t, nDim>& multi_id_)
+            : id(id_), multi_id(multi_id_)
         {
         }

-        template <index_t IDim>
-        __host__ __device__ constexpr bool operator()(Number<IDim>) const
+        template <class X>
+        __host__ __device__ constexpr void operator()(X IDim) const
         {
-            constexpr index_t stride = DummyStrides::Get(Number<IDim>{});
-            multi_id_ref.Set(Number<IDim>{}, id_ref / stride);
-            id_ref -= multi_id_ref.Get(Number<IDim>{}) * stride;
-            return true;
+            constexpr index_t stride = PackedStrides::Get(IDim);
+            multi_id.Set(IDim, id / stride);
+            id -= multi_id[IDim] * stride;
         }
     };
```
```diff
@@ -208,27 +191,15 @@ struct ConstantTensorDescriptor
     {
         Array<index_t, nDim> multi_id;

-        constexpr auto dummy_strides = calculate_tensor_strides_packed(GetLengths());
+        using PackedStrides = decltype(calculate_tensor_strides_packed(GetLengths()));

         // calculate index in each of the dimensions in the order of their dimension
-        static_for<0, nDim - 1, 1>{}(GetMultiIndexFrom1dIndex_impl(id, multi_id));
+        static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex<PackedStrides>(id, multi_id));

-        index_t itmp = id / dummy_strides.Get(Number<nDim - 1>{});
-
-        multi_id.Set(Number<nDim - 1>{}, itmp);
+        multi_id.Set(Number<nDim - 1>{}, id / PackedStrides::Get(Number<nDim - 1>{}));

         return multi_id;
     }
-#endif
-
-#if 0
-    // return type is Sequence<...>
-    template <index_t Id>
-    __host__ __device__ static constexpr auto GetMultiIndexFrom1dIndex(Number<Id>)
-    {
-        return inclusive_scan_sequence(f_impl, GetStrides(), Number<Id>{});
-    }
-#endif

     __host__ __device__ static constexpr auto
     GetOriginalMultiIndexFromMultiIndex(Array<index_t, nDim> multi_id)
```
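For a packed tensor the two conversions are inverses: decomposing a 1d index against the packed strides and re-applying the offset dot product must reproduce the index. A small runtime check of that round trip (illustrative only; the descriptor versions run at compile time):

```cpp
#include <cassert>

int main()
{
    const int strides[3] = {15, 5, 1}; // packed strides for lengths {4, 3, 5}
    for(int id = 0; id < 60; ++id)
    {
        int rem = id, multi[3];
        for(int d = 0; d < 2; ++d)
        {
            multi[d] = rem / strides[d];
            rem -= multi[d] * strides[d];
        }
        multi[2] = rem / strides[2];
        assert(multi[0] * strides[0] + multi[1] * strides[1] + multi[2] * strides[2] == id);
    }
    return 0;
}
```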
```diff
@@ -236,9 +207,10 @@ struct ConstantTensorDescriptor
         return multi_id;
     }

-    // This function doesn't do carry check on the highest dimension, for performance reason.
-    // It is the user's responsibility to make sure the result "new_multi_id" is not out-of-bound
-    // on the highest dimension
+    // This function doesn't do carry check on the highest dimension for positive stepping (or
+    // borrow check on the lowest dimension for negative stepping), for performance reasons. It is
+    // the user's responsibility to make sure the result "new_multi_id" is not out-of-bound on the
+    // highest dimension for positive stepping (or on the lowest dimension for negative stepping).
     template <bool PositiveDirection>
     __host__ __device__ static Array<index_t, nDim>
     UpdateMultiIndexGivenStepSizeOf1dIndex(Array<index_t, nDim> old_multi_id,
```
```diff
@@ -262,14 +234,14 @@ struct ConstantTensorDescriptor
             if(carry)
             {
-                ++new_multi_id[idim];
+                ++new_multi_id(idim);
             }

             carry = false;

             if(new_multi_id[idim] >= GetLength(IDim))
             {
-                new_multi_id[idim] -= GetLength(IDim);
+                new_multi_id(idim) -= GetLength(IDim);
                 carry = true;
             }
         });
```
```diff
@@ -288,14 +260,14 @@ struct ConstantTensorDescriptor
             if(borrow)
             {
-                --new_multi_id[idim];
+                --new_multi_id(idim);
             }

             borrow = false;

             if(new_multi_id[idim] < GetLength(IDim))
             {
-                new_multi_id[idim] += GetLength(IDim);
+                new_multi_id(idim) += GetLength(IDim);
                 borrow = true;
             }
         });
```
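A runtime sketch of the carry propagation that `UpdateMultiIndexGivenStepSizeOf1dIndex` performs when stepping in the positive direction: add the step's per-dimension deltas, then ripple carries from the innermost dimension outward. As the new comment says, the highest dimension is deliberately never bound-checked (values below are made up):

```cpp
#include <cstdio>

void step_multi_index(int* multi, const int* lengths, const int* deltas, int ndim)
{
    for(int d = 0; d < ndim; ++d)
        multi[d] += deltas[d];

    bool carry = false;
    for(int d = ndim - 1; d > 0; --d) // dimension 0 (the highest) is never wrapped
    {
        if(carry)
            ++multi[d];
        carry = false;
        if(multi[d] >= lengths[d])
        {
            multi[d] -= lengths[d];
            carry = true;
        }
    }
    if(carry)
        ++multi[0]; // carry lands here unchecked; caller must keep it in bounds
}

int main()
{
    int multi[3]         = {1, 2, 4};
    const int lengths[3] = {4, 3, 5};
    const int deltas[3]  = {0, 0, 3}; // step of 3 in the innermost dimension
    step_multi_index(multi, lengths, deltas, 3);
    std::printf("%d %d %d\n", multi[0], multi[1], multi[2]); // 2 0 2
    return 0;
}
```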
```diff
@@ -382,15 +354,7 @@ struct ConstantTensorDescriptor
         return ConstantTensorDescriptor<decltype(new_lengths), decltype(new_strides)>{};
     }

-    template <index_t Threashold, index_t Delta>
-    struct f_unfold_impl
-    {
-        __host__ __device__ constexpr index_t operator()(index_t x) const
-        {
-            return x > Threashold ? x - Delta : x;
-        }
-    };
-
     // this function unfolds dimensions [FirstUnfoldDim, ..., LastUnfoldDim] into 1 dimension
     template <index_t FirstUnfoldDim, index_t LastUnfoldDim>
     __host__ __device__ static constexpr auto Unfold(Number<FirstUnfoldDim>, Number<LastUnfoldDim>)
     {
```
```diff
@@ -398,24 +362,6 @@ struct ConstantTensorDescriptor
                       FirstUnfoldDim <= LastUnfoldDim,
                       "wrong! should have FirstUnfoldDim <= LastUnfoldDim!");

-#if 0 // cannot compile: compiler complains about constexpr
-        // dimensions to be unfolded need to be in descending order (w.r.t. strides), and need to
-        // be packed in memory, otherwise, unfolding is invalid
-        static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim_) {
-            constexpr auto IDim    = decltype(IDim_){};
-            constexpr auto IDim_p1 = IDim + Number<1>{};
-
-            // check stride
-            static_assert(
-                GetStride(IDim) >= GetStride(IDim_p1),
-                "wrong! dimensions to be unfolded need to be in descending order w.r.t strides");
-
-            // check if packed
-            static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
-                          "wrong! dimensions to be unfolded need to be packed");
-        });
-#endif
-
         // left and right
         constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{};
         constexpr auto middle =
```
```diff
@@ -423,6 +369,9 @@ struct ConstantTensorDescriptor
         constexpr auto right =
             typename arithmetic_sequence_gen<LastUnfoldDim + 1, GetNumOfDimension(), 1>::SeqType{};

+        // dimensions to be unfolded need to be continuous
+        static_assert(Type::Extract(middle).AreDimensionsContinuous(), "wrong! not unfoldable");
+
         // unfolded length, stride
         constexpr index_t unfold_length = accumulate_on_sequence(
             GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});
```
```diff
@@ -446,16 +395,16 @@ struct ConstantTensorDescriptor
     template <class MapNew2Old>
     __host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old)
     {
-        return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenNew2Old(MapNew2Old{})),
-                                        decltype(Strides{}.ReorderGivenNew2Old(MapNew2Old{}))>{};
+        return ConstantTensorDescriptor<decltype(Lengths::ReorderGivenNew2Old(MapNew2Old{})),
+                                        decltype(Strides::ReorderGivenNew2Old(MapNew2Old{}))>{};
     }

 #if 0 // require sequence_sort, which is not implemented yet
     template <class MapOld2New>
     __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New)
     {
-        return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenOld2New(MapOld2New{})),
-                                        decltype(Strides{}.ReorderGivenOld2New(MapOld2New{}))>{}
+        return ConstantTensorDescriptor<decltype(Lengths::ReorderGivenOld2New(MapOld2New{})),
+                                        decltype(Strides::ReorderGivenOld2New(MapOld2New{}))>{}
     }
 #endif
 };
```
src/include/Sequence.hip.hpp — view file @ 7a89684f

```diff
@@ -16,7 +16,23 @@ struct Sequence
     {
         static_assert(I < mSize, "wrong! I too large");

-        // the last dummy element is to prevent compiler complain about empty Sequence
+        // the last dummy element is to prevent the compiler complaining about an empty array,
+        // when mSize = 0
         const index_t mData[mSize + 1] = {Is..., 0};
         return mData[I];
     }

+    template <index_t I>
+    __host__ __device__ constexpr index_t operator[](Number<I>) const
+    {
+        static_assert(I < mSize, "wrong! I too large");
+
+        const index_t mData[mSize + 1] = {Is..., 0};
+        return mData[I];
+    }
+
+    // make sure I is constexpr
+    __host__ __device__ constexpr index_t operator[](index_t I) const
+    {
+        const index_t mData[mSize + 1] = {Is..., 0};
+        return mData[I];
+    }
```
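The "+1 dummy element" trick from the hunk above, in isolation: a local array of size `mSize + 1` initialized with `{Is..., 0}` stays well-formed even for an empty pack, where a zero-length array would be ill-formed. Minimal sketch (plain C++14, simplified names):

```cpp
using index_t = int; // assumption

template <index_t... Is>
struct MiniSequence
{
    static constexpr index_t mSize = sizeof...(Is);

    constexpr index_t operator[](index_t i) const
    {
        const index_t data[mSize + 1] = {Is..., 0}; // trailing 0 is the dummy
        return data[i];
    }
};

static_assert(MiniSequence<7, 8, 9>{}[1] == 8, "indexing a pack via a local array");
static_assert(MiniSequence<>{}.mSize == 0, "empty sequence still compiles");
```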
```diff
@@ -30,6 +46,9 @@ struct Sequence
                       "wrong! invalid new2old map");
 #endif

+        static_assert(sizeof...(Is) == sizeof...(IRs),
+                      "wrong! new2old map should have the same size as Sequence to be rerodered");
+
         return Sequence<Type{}.Get(Number<IRs>{})...>{};
     }
```
```diff
@@ -322,11 +341,6 @@ __host__ __device__ constexpr auto operator-(Sequence<Xs...> seq_x, Sequence<Ys...> seq_y)
 {
     static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size");

-#if 0
-    static_for<0, seq_x.GetSize(), 1>{}(
-        [&](auto I) { static_assert(seq_x.Get(I) >= seq_y.Get(I), "wrong! going to undeflow"); });
-#endif
-
     return Sequence<(Xs - Ys)...>{};
 }
```
```diff
@@ -363,15 +377,6 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Number<Y>)
 template <index_t... Xs, index_t Y>
 __host__ __device__ constexpr auto operator-(Sequence<Xs...>, Number<Y>)
 {
-#if 0 // TODO: turn it on. Doesn't compile
-    constexpr auto seq_x = Sequence<Xs...>{};
-
-    static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) {
-        constexpr auto I = decltype(Iter){};
-        static_assert(seq_x.Get(I) >= Y, "wrong! going to underflow");
-    });
-#endif
-
     return Sequence<(Xs - Y)...>{};
 }
```
```diff
@@ -404,13 +409,6 @@ __host__ __device__ constexpr auto operator-(Number<Y>, Sequence<Xs...>)
 {
     constexpr auto seq_x = Sequence<Xs...>{};

-#if 0
-    static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) {
-        constexpr auto I = decltype(Iter){};
-        static_assert(seq_x.Get(I) <= Y, "wrong! going to underflow");
-    });
-#endif
-
     return Sequence<(Y - Xs)...>{};
 }
```
```diff
@@ -482,25 +480,6 @@ __host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce, Number<Init>)
     return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}, Number<Init>{}).Reverse();
 }

-template <class Seq>
-struct accumulate_on_sequence_impl
-{
-    template <class IDim>
-    __host__ __device__ constexpr index_t operator()(IDim) const
-    {
-        return Seq{}.Get(IDim{});
-    }
-};
-
-template <class Seq, class Reduce, index_t I>
-__host__ __device__ constexpr index_t accumulate_on_sequence(Seq, Reduce, Number<I> /*initial_value*/)
-{
-    constexpr index_t a =
-        static_const_reduce_n<Seq::mSize>{}(accumulate_on_sequence_impl<Seq>{}, Reduce{});
-
-    return Reduce{}(a, I);
-}
-
 template <index_t... Is>
 __host__ __device__ constexpr auto Sequence<Is...>::PopFront()
 {
```
src/include/blockwise_generic_tensor_slice_op.hip.hpp — view file @ 7a89684f

```diff
@@ -122,7 +122,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr auto src_partial_original_desc =
                 SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);

-            mThreadSrcPartialOffsets[idim] = src_partial_original_desc.GetOffsetFromMultiIndex(
+            mThreadSrcPartialOffsets(idim) = src_partial_original_desc.GetOffsetFromMultiIndex(
                 extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
         });
```
```diff
@@ -136,7 +136,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr auto dst_partial_original_desc =
                 DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);

-            mThreadDstPartialOffsets[idim] = dst_partial_original_desc.GetOffsetFromMultiIndex(
+            mThreadDstPartialOffsets(idim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
                 extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
         });
```
```diff
@@ -369,7 +369,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr auto I = decltype(I_){};

             constexpr index_t idim_original = src_partial_original_dims.Get(I);

-            mThreadSrcOriginalMultiId[idim_original] =
+            mThreadSrcOriginalMultiId(idim_original) =
                 new_src_partial_original_multi_id[I.Get()];
         });
```
```diff
@@ -381,7 +381,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 new_src_partial_original_multi_id);

         // update "mThreadSrcPartialOffsets"
-        mThreadSrcPartialOffsets[idim] = new_src_partial_offset;
+        mThreadSrcPartialOffsets(idim) = new_src_partial_offset;

         // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
         mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;
```
```diff
@@ -401,15 +401,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
         static_if<PositiveDirection>{}([&](auto fwd) {
             mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);

-            mThreadSrcOriginalMultiId[idim_original] += StepSize;
+            mThreadSrcOriginalMultiId(idim_original) += StepSize;

-            mThreadSrcPartialOffsets[idim] += StepSize * fwd(SrcDesc{}).GetStride(IDim);
+            mThreadSrcPartialOffsets(idim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
         }).Else([&](auto fwd) {
             mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);

-            mThreadSrcOriginalMultiId[idim_original] -= StepSize;
+            mThreadSrcOriginalMultiId(idim_original) -= StepSize;

-            mThreadSrcPartialOffsets[idim] -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
+            mThreadSrcPartialOffsets(idim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
         });
     });
 }
```
src/include/common.hip.hpp — view file @ 7a89684f

```diff
@@ -110,7 +110,7 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
 // this is wrong
 // TODO: implement correct least common multiple, instead of calling max()
 template <class T, class... Ts>
-__host__ __device__ constexpr T least_common_multiple(T x, Ts... xs)
+__host__ __device__ constexpr T lcm(T x, Ts... xs)
 {
     return max(x, xs...);
 }
```
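The renamed `mod_conv::lcm` still just returns `max()`, as its TODO admits; `max` coincides with the true LCM only when every argument divides the largest one, which holds for the power-of-two vector widths these kernels pass in. A correct `constexpr` version, as a hedged sketch (not the project's code):

```cpp
using index_t = int; // assumption

constexpr index_t gcd(index_t a, index_t b) { return b == 0 ? a : gcd(b, a % b); }

constexpr index_t lcm(index_t a) { return a; }

template <class... Ts>
constexpr index_t lcm(index_t a, index_t b, Ts... rest)
{
    return lcm((a / gcd(a, b)) * b, rest...); // divide first to avoid overflow
}

static_assert(lcm(4, 6) == 12, "max(4, 6) == 6 would be wrong here");
static_assert(lcm(2, 4, 8) == 8, "lcm equals max when 2 and 4 divide 8");
```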
src/include/functional.hip.hpp — view file @ 7a89684f

```diff
@@ -19,18 +19,7 @@ struct swallow
     }
 };

-#if 0
-template <class F>
-__host__ __device__ constexpr auto unpacker(F f)
-{
-    return [=](auto xs_array) { f(xs...); };
-}
-#endif
-
-// Emulate compile time if statement for C++14
-// Get the idea from
-// "https://baptiste-wicht.com/posts/2015/07/simulate-static_if-with-c11c14.html"
-// TODO: use if constexpr, when C++17 is supported
+// Emulate if constexpr
 template <bool Predicate>
 struct static_if
 {
```
```diff
@@ -81,28 +70,3 @@ struct static_if<false>
         return Type{};
     }
 };
-
-template <index_t NLoop>
-struct static_const_reduce_n
-{
-    // signature of F: F(Number<I>)
-    template <class F, class Reduce>
-    __host__ __device__ constexpr auto operator()(F f, Reduce r) const
-    {
-        static_assert(NLoop > 1, "out-of-range");
-
-        constexpr auto a = f(Number<NLoop - 1>{});
-        auto b = static_const_reduce_n<NLoop - 1>{}(f, r); // TODO: cannot use constexpr here, weird
-        return r(a, b);
-    }
-};
-
-template <>
-struct static_const_reduce_n<1>
-{
-    template <class F, class Reduce>
-    __host__ __device__ constexpr auto operator()(F f, Reduce) const
-    {
-        return f(Number<0>{});
-    }
-};
```
src/include/functional2.hip.hpp — view file @ 7a89684f

```diff
@@ -2,29 +2,16 @@
 #include "functional.hip.hpp"
 #include "Sequence.hip.hpp"

-#if 0
-template <index_t Iter, index_t Remaining, index_t Increment>
-struct static_for_impl
-{
-    template <class F>
-    constexpr __host__ __device__ void operator()(F f) const
-    {
-        static_assert(Remaining % Increment == 0, "wrong! Remaining % Increment != 0");
-        static_assert(Increment <= Remaining, "will go out-of-range");
-
-        f(Number<Iter>{});
-
-        static_for_impl<Iter + Increment, Remaining - Increment, Increment>{}(f);
-    }
-};
-
-template <index_t Iter, index_t Increment>
-struct static_for_impl<Iter, 0, Increment>
+template <class>
+struct static_for_impl;
+
+template <index_t... Is>
+struct static_for_impl<Sequence<Is...>>
 {
     template <class F>
-    constexpr __host__ __device__ void operator()(F) const
+    __host__ __device__ constexpr void operator()(F f) const
     {
-        // no work left, just return
-        return;
+        swallow{(f(Number<Is>{}), 0)...};
     }
 };
```
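The trick behind the new `static_for_impl`, reduced to a standalone C++14 sketch: expanding `(f(Number<Is>{}), 0)...` inside a braced init-list calls `f` once per element, in order, without recursion. The `swallow` stand-in below is any type constructible from an `initializer_list<int>` (the project defines its own in functional.hip.hpp):

```cpp
#include <cstdio>
#include <initializer_list>

using index_t = int; // assumption

template <index_t I>
struct Number { static constexpr index_t value = I; };

struct swallow
{
    swallow(std::initializer_list<int>) {}
};

template <index_t... Is, class F>
void static_for_over(F f)
{
    // braced-init-list guarantees left-to-right evaluation, so f runs in order
    swallow{(f(Number<Is>{}), 0)...};
}

int main()
{
    static_for_over<0, 1, 2>([](auto i) { std::printf("%d ", decltype(i)::value); }); // 0 1 2
    return 0;
}
```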
```diff
@@ -33,48 +20,42 @@ template <index_t NBegin, index_t NEnd, index_t Increment>
 struct static_for
 {
     template <class F>
-    constexpr __host__ __device__ void operator()(F f) const
+    __host__ __device__ constexpr void operator()(F f) const
     {
         static_assert(NBegin <= NEnd, "wrongs! should have NBegin <= NEnd");
         static_assert((NEnd - NBegin) % Increment == 0,
                       "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");

-#if 0
-        static_if<(NBegin < NEnd)>{}(
-            [&](auto fwd) { static_for_impl<NBegin, NEnd - NBegin, fwd(Increment)>{}(f); });
-#else
-        static_for_impl<NBegin, NEnd - NBegin, Increment>{}(f);
-#endif
+        static_for_impl<typename arithmetic_sequence_gen<NBegin, NEnd, Increment>::SeqType>{}(f);
     }
 };

-#else
-template <class>
-struct static_for_impl;
-
-template <index_t... Is>
-struct static_for_impl<Sequence<Is...>>
+template <class Seq, class Reduce>
+struct lambda_accumulate_on_sequence
 {
-    template <class F>
-    __host__ __device__ constexpr void operator()(F f) const
-    {
-        swallow{(f(Number<Is>{}), 0)...};
-    }
-};
+    const Reduce& f;
+    index_t& result;

-// F signature: F(Number<Iter>)
-template <index_t NBegin, index_t NEnd, index_t Increment>
-struct static_for
-{
-    template <class F>
-    __host__ __device__ constexpr void operator()(F f) const
+    __host__ __device__ constexpr lambda_accumulate_on_sequence(const Reduce& f_, index_t& result_)
+        : f(f_), result(result_)
     {
-        static_assert(NBegin <= NEnd, "wrongs! should have NBegin <= NEnd");
-        static_assert((NEnd - NBegin) % Increment == 0,
-                      "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
+    }

-        static_for_impl<typename arithmetic_sequence_gen<NBegin, NEnd, Increment>::SeqType>{}(f);
+    template <class IDim>
+    __host__ __device__ constexpr index_t operator()(IDim) const
+    {
+        return result = f(result, Seq::Get(IDim{}));
     }
 };
-#endif
+
+template <class Seq, class Reduce, index_t Init>
+__host__ __device__ constexpr index_t
+accumulate_on_sequence(Seq, Reduce f, Number<Init> /*initial_value*/)
+{
+    index_t result = Init;
+
+    static_for<0, Seq::mSize, 1>{}(lambda_accumulate_on_sequence<Seq, Reduce>(f, result));
+
+    return result;
+}
```
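`accumulate_on_sequence` (relocated here from Sequence.hip.hpp) now folds the reducer over the sequence with `static_for` instead of the deleted recursive `static_const_reduce_n`. A standalone C++14 mimic with simplified names, using a plain loop where the project uses `static_for`:

```cpp
#include <cstdio>

using index_t = int; // assumption

template <index_t... Is>
struct Seq
{
    static constexpr index_t mSize = sizeof...(Is);
    static constexpr index_t Get(index_t i)
    {
        const index_t data[mSize + 1] = {Is..., 0}; // same dummy-element trick
        return data[i];
    }
};

struct multiplies
{
    constexpr index_t operator()(index_t a, index_t b) const { return a * b; }
};

template <class S, class Reduce>
index_t accumulate_on_sequence(S, Reduce f, index_t init)
{
    index_t result = init;
    for(index_t i = 0; i < S::mSize; ++i) // stands in for static_for
        result = f(result, S::Get(i));
    return result;
}

int main()
{
    // e.g. the product of tensor lengths, as Unfold uses it above
    std::printf("%d\n", accumulate_on_sequence(Seq<2, 3, 4>{}, multiplies{}, 1)); // 24
    return 0;
}
```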
src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp — view file @ 7a89684f

```diff
@@ -103,7 +103,7 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
     // tensor view of blockwise input and weight in LDS
     // be careful of alignment
-    constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N,
+    constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                 WeiBlockCopyDataPerRead_K,
                                                 GemmDataPerReadA,
                                                 GemmDataPerReadB);
```
```diff
@@ -119,11 +119,11 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
     constexpr auto wei_cyx_k_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<CPerBlock * Y * X, KPerBlock>{},
-        Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+        Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

     constexpr auto wei_c_y_x_k_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<CPerBlock, Y, X, KPerBlock>{},
-        Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+        Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

     // tensor view of threadwise output in register
     constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
```
src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp — view file @ 7a89684f

```diff
@@ -104,7 +104,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
     // LDS tensor view
     // be careful of alignment
-    constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N,
+    constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                 WeiBlockCopyDataPerRead_K,
                                                 GemmDataPerReadA,
                                                 GemmDataPerReadB);
```
```diff
@@ -120,7 +120,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
     constexpr auto wei_c_x_k_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<CPerBlock, X, KPerBlock>{},
-        Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+        Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

     // tensor view of threadwise output in register
     constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
```
src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp — view file @ 7a89684f

```diff
@@ -108,7 +108,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
     // LDS tensor view
     // be careful of alignment
-    constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+    constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                 WeiBlockCopyDataPerRead_K,
                                                 GemmDataPerReadA,
                                                 GemmDataPerReadB);
```
src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp — view file @ 7a89684f

```diff
@@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
     // LDS tensor view
     // be careful of alignment
-    constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N,
+    constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                 WeiBlockCopyDataPerRead_K,
                                                 GemmDataPerReadA,
                                                 GemmDataPerReadB);
```
```diff
@@ -115,7 +115,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
     constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<CPerBlock, KPerBlock>{},
-        Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+        Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

     // tensor view of threadwise output in register
     constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
```
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp — view file @ 7a89684f

```diff
@@ -104,7 +104,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
     // LDS tensor view
     // be careful of alignment
-    constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N,
+    constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                 WeiBlockCopyDataPerRead_K,
                                                 GemmDataPerReadA,
                                                 GemmDataPerReadB);
```
```diff
@@ -120,7 +120,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
     constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<CPerBlock, KPerBlock>{},
-        Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+        Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

     // tensor view of threadwise output in register
     constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
```
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp — view file @ 7a89684f

```diff
@@ -106,7 +106,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
     // LDS tensor view
     // be careful of alignment
-    constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+    constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                 WeiBlockCopyDataPerRead_K,
                                                 GemmDataPerReadA,
                                                 GemmDataPerReadB);
```
```diff
@@ -122,7 +122,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
     constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<CPerBlock, KPerBlock>{},
-        Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+        Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

     // tensor view of threadwise output in register
     constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
```
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp — view file @ 7a89684f

```diff
@@ -105,7 +105,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
     // LDS tensor view
     // be careful of alignment
-    constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+    constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                 WeiBlockCopyDataPerRead_K,
                                                 GemmDataPerReadA,
                                                 GemmDataPerReadB);
```
```diff
@@ -121,7 +121,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
     constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<CPerBlock, KPerBlock>{},
-        Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+        Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

     // tensor view of threadwise output in register
     constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
```
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp — view file @ 7a89684f

```diff
@@ -104,7 +104,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
     // LDS tensor view
     // be careful of alignment
-    constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+    constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                 WeiBlockCopyDataPerRead_K,
                                                 GemmDataPerReadA,
                                                 GemmDataPerReadB);
```
```diff
@@ -120,7 +120,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
     constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<CPerBlock, KPerBlock>{},
-        Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+        Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

     // tensor view of threadwise output in register
     constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
```
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp — view file @ 7a89684f

```diff
@@ -103,7 +103,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
     // LDS tensor view
     // be careful of alignment
-    constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+    constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                 WeiBlockCopyDataPerRead_K,
                                                 GemmDataPerReadA,
                                                 GemmDataPerReadB);
```
```diff
@@ -119,7 +119,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
     constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
         Sequence<CPerBlock, KPerBlock>{},
-        Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+        Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

     // tensor view of threadwise output in register
     constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
```
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp — view file @ 7a89684f

```diff
@@ -181,7 +181,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
     // LDS: be careful of alignment
     constexpr index_t max_align =
-        mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
+        mod_conv::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);

     constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});
```
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp — view file @ 7a89684f

```diff
@@ -185,7 +185,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
     // LDS: be careful of alignment
     constexpr index_t max_align =
-        mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
+        mod_conv::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);

     constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});
```