gaoqiong / composable_kernel · Commits

Commit 109f1e90, authored Jun 07, 2019 by Chao Liu

    Merge branch 'master' into implicit_gemm_v4_backward

Parents: 0b10c0bb, a68b16a5

Changes: 27 files in total; this page shows 20 changed files with 420 additions and 584 deletions.
Files changed on this page:

src/include/Array.hip.hpp  (+94 −90)
src/include/ConstantMergedTensorDescriptor.hip.hpp  (+26 −52)
src/include/ConstantTensorDescriptor.hip.hpp  (+77 −126)
src/include/Sequence.hip.hpp  (+34 −54)
src/include/base.hip.hpp  (+113 −0)
src/include/blockwise_generic_tensor_slice_op.hip.hpp  (+21 −47)
src/include/common.hip.hpp  (+1 −106)
src/include/functional.hip.hpp  (+1 −37)
src/include/functional2.hip.hpp  (+28 −47)
src/include/functional3.hip.hpp  (+6 −6)
src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp  (+3 −3)
src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp  (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp  (+1 −1)
src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp  (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp  (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp  (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp  (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp  (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp  (+2 −2)
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp  (+1 −1)
src/include/Array.hip.hpp

@@ -18,18 +18,22 @@ struct Array
     __host__ __device__ constexpr index_t GetSize() const { return NSize; }

     __host__ __device__ constexpr TData operator[](index_t i) const { return mData[i]; }

+    template <index_t I>
+    __host__ __device__ constexpr TData operator[](Number<I>) const
+    {
+        return mData[I];
+    }
+
+    __host__ __device__ TData& operator[](index_t i) { return mData[i]; }
+
     template <index_t I>
-    __host__ __device__ constexpr TData Get(Number<I>) const
+    __host__ __device__ TData& operator()(Number<I>)
     {
         static_assert(I < NSize, "wrong!");
         return mData[I];
     }

+    __host__ __device__ TData& operator()(index_t i) { return mData[i]; }
+
     template <index_t I>
     __host__ __device__ constexpr void Set(Number<I>, TData x)
     {
@@ -38,16 +42,33 @@ struct Array
         mData[I] = x;
     }

+    __host__ __device__ constexpr void Set(index_t I, TData x) { mData[I] = x; }
+
+    struct lambda_PushBack // emulate constexpr lambda
+    {
+        const Array<TData, NSize>& old_array;
+        Array<TData, NSize + 1>& new_array;
+
+        __host__ __device__ constexpr lambda_PushBack(const Array<TData, NSize>& old_array_,
+                                                      Array<TData, NSize + 1>& new_array_)
+            : old_array(old_array_), new_array(new_array_)
+        {
+        }
+
+        template <index_t I>
+        __host__ __device__ constexpr void operator()(Number<I>) const
+        {
+            new_array.Set(Number<I>{}, old_array[I]);
+        }
+    };
+
     __host__ __device__ constexpr auto PushBack(TData x) const
     {
         Array<TData, NSize + 1> new_array;

-        static_for<0, NSize, 1>{}([&](auto I) {
-            constexpr index_t i = I.Get();
-            new_array[i]        = mData[i];
-        });
+        static_for<0, NSize, 1>{}(lambda_PushBack(*this, new_array));

-        new_array[NSize] = x;
+        new_array.Set(Number<NSize>{}, x);

         return new_array;
    }
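The lambda_PushBack functor above is the template for a pattern this merge applies across the whole tree, which the diff's own comments call "emulate constexpr lambda". Before C++17 a lambda's call operator cannot be constexpr, so each generic lambda passed to static_for is replaced by a named functor whose operator() is explicitly constexpr. A minimal standalone sketch of the pattern, with hypothetical names, not code from this commit:

// Sketch of the "emulate constexpr lambda" pattern (hypothetical names).
// Pre-C++17, a lambda's operator() cannot be constexpr, so compile-time
// loops take a named functor whose operator() is declared constexpr.
template <int N>
struct Num // stand-in for the repo's Number<I> integral-constant wrapper
{
    static constexpr int value = N;
};

struct lambda_copy // plays the same role as lambda_PushBack above
{
    const int (&src)[4];
    int (&dst)[4];

    constexpr lambda_copy(const int (&src_)[4], int (&dst_)[4]) : src(src_), dst(dst_) {}

    template <int I>
    constexpr void operator()(Num<I>) const // the body a constexpr lambda would hold
    {
        dst[I] = src[I];
    }
};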
@@ -62,93 +83,60 @@ __host__ __device__ constexpr auto sequence2array(Sequence<Is...>)

 template <class TData, index_t NSize>
 __host__ __device__ constexpr auto make_zero_array()
 {
 #if 0
     Array<TData, NSize> a;
     static_for<0, NSize, 1>{}([&](auto I) {
         constexpr index_t i = I.Get();
         a[i] = static_cast<TData>(0);
     });
     return a;
 #else
     constexpr auto zero_sequence = typename uniform_sequence_gen<NSize, 0>::SeqType{};
     constexpr auto zero_array    = sequence2array(zero_sequence);
     return zero_array;
 #endif
 }

 template <class TData, index_t NSize, index_t... IRs>
 __host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
-                                                               Sequence<IRs...> new2old)
+                                                               Sequence<IRs...> /* new2old */)
 {
-    Array<TData, NSize> new_array;
-
     static_assert(NSize == sizeof...(IRs), "NSize not consistent");

-    static_for<0, NSize, 1>{}([&](auto IDim) {
-        constexpr index_t idim = IDim.Get();
-        new_array[idim]        = old_array[new2old.Get(IDim)];
-    });
+    static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");

-    return new_array;
+    return Array<TData, NSize>{old_array.mSize[IRs]...};
 }

 #if 0
 template <class TData, index_t NSize, index_t... IRs>
 __host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>& old_array,
                                                                Sequence<IRs...> old2new)
 {
     Array<TData, NSize> new_array;

     static_assert(NSize == sizeof...(IRs), "NSize not consistent");

     static_for<0, NSize, 1>{}([&](auto IDim) {
         constexpr index_t idim = IDim.Get();
         new_array[old2new.Get(IDim)] = old_array[idim];
     });

     return new_array;
 }
 #else
 template <class TData, index_t NSize, class MapOld2New>
-struct reorder_array_given_old2new_impl
+struct lambda_reorder_array_given_old2new
 {
-    const Array<TData, NSize>& old_array_ref;
-    Array<TData, NSize>& new_array_ref;
-
-    __host__ __device__ constexpr reorder_array_given_old2new_impl(
-        const Array<TData, NSize>& old_array, Array<TData, NSize>& new_array)
-        : old_array_ref(old_array), new_array_ref(new_array)
+    const Array<TData, NSize>& old_array;
+    Array<TData, NSize>& new_array;
+
+    __host__ __device__ constexpr lambda_reorder_array_given_old2new(
+        const Array<TData, NSize>& old_array_, Array<TData, NSize>& new_array_)
+        : old_array(old_array_), new_array(new_array_)
     {
     }

     template <index_t IOldDim>
     __host__ __device__ constexpr void operator()(Number<IOldDim>) const
     {
-        TData old_data = old_array_ref.Get(Number<IOldDim>{});
+        TData old_data = old_array[IOldDim];

         constexpr index_t INewDim = MapOld2New::Get(Number<IOldDim>{});

-        new_array_ref.Set(Number<INewDim>{}, old_data);
+        new_array.Set(Number<INewDim>{}, old_data);
     }
 };

 template <class TData, index_t NSize, index_t... IRs>
 __host__ __device__ constexpr auto reorder_array_given_old2new(const Array<TData, NSize>& old_array,
-                                                               Sequence<IRs...> old2new)
+                                                               Sequence<IRs...> /* old2new */)
 {
     Array<TData, NSize> new_array;

     static_assert(NSize == sizeof...(IRs), "NSize not consistent");

     static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");

-    static_for<0, NSize, 1>{}(reorder_array_given_old2new_impl<TData, NSize, Sequence<IRs...>>(
-        old_array, new_array));
+    static_for<0, NSize, 1>{}(lambda_reorder_array_given_old2new<TData, NSize, Sequence<IRs...>>(
+        old_array, new_array));

     return new_array;
 }
 #endif

 template <class TData, index_t NSize, class ExtractSeq>
 __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_array, ExtractSeq)

@@ -159,25 +147,44 @@ __host__ __device__ constexpr auto extract_array(const Array<TData, NSize>& old_
     static_assert(new_size <= NSize, "wrong! too many extract");

-    static_for<0, new_size, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        new_array[i]        = old_array[ExtractSeq::Get(I)];
-    });
+    static_for<0, new_size, 1>{}([&](auto I) { new_array(I) = old_array[ExtractSeq::Get(I)]; });

     return new_array;
 }
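For reference, extract_array selects entries of an Array by a compile-time Sequence of indices. A hedged usage sketch, assuming the Array, Sequence, and make_zero_array machinery shown above:

// Hypothetical usage of extract_array: pick entries {0, 2} of a 3-entry Array
// with a compile-time Sequence. Assumes the Array/Sequence types above.
__host__ __device__ void extract_array_example()
{
    Array<index_t, 3> whole = make_zero_array<index_t, 3>();
    whole(0) = 10;
    whole(1) = 20;
    whole(2) = 30;

    auto part = extract_array(whole, Sequence<0, 2>{}); // part[0] == 10, part[1] == 30
}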
+// emulate constexpr lambda for array math
+template <class F, class X, class Y, class Z>
+struct lambda_array_math
+{
+    const F& f;
+    const X& x;
+    const Y& y;
+    Z& z;
+
+    __host__ __device__ constexpr lambda_array_math(const F& f_, const X& x_, const Y& y_, Z& z_)
+        : f(f_), x(x_), y(y_), z(z_)
+    {
+    }
+
+    template <index_t IDim_>
+    __host__ __device__ constexpr void operator()(Number<IDim_>) const
+    {
+        constexpr auto IDim = Number<IDim_>{};
+
+        z.Set(IDim, f(x[IDim], y[IDim]));
+    }
+};
+
 // Array = Array + Array
 template <class TData, index_t NSize>
 __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData, NSize> b)
 {
     Array<TData, NSize> result;

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result[i]           = a[i] + b[i];
-    });
+    auto f = mod_conv::plus<index_t>{};
+
+    static_for<0, NSize, 1>{}(
+        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(f, a, b, result));

     return result;
 }
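A hedged usage sketch of the elementwise operators just defined; the brace-initialization of Array is an assumption about its aggregate layout, not something this diff shows:

// Hypothetical usage of the elementwise Array operators above. The aggregate
// initialization syntax is an assumption about Array's layout.
__host__ __device__ void array_math_example()
{
    Array<index_t, 3> a{{1, 2, 3}};
    Array<index_t, 3> b{{4, 5, 6}};

    auto c = a + b; // static_for drives lambda_array_math with mod_conv::plus
    // c[0] == 5, c[1] == 7, c[2] == 9
}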
@@ -188,11 +195,11 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData,
 {
     Array<TData, NSize> result;

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result[i]           = a[i] - b[i];
-    });
+    auto f = mod_conv::minus<index_t>{};
+
+    static_for<0, NSize, 1>{}(
+        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(f, a, b, result));

     return result;
 }

@@ -205,11 +212,11 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...
     Array<TData, NSize> result;

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result[i]           = a[i] + b.Get(I);
-    });
+    auto f = mod_conv::plus<index_t>{};
+
+    static_for<0, NSize, 1>{}(
+        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(f, a, b, result));

     return result;
 }

@@ -222,11 +229,11 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is...
     Array<TData, NSize> result;

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result[i]           = a[i] - b.Get(I);
-    });
+    auto f = mod_conv::minus<index_t>{};
+
+    static_for<0, NSize, 1>{}(
+        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(f, a, b, result));

     return result;
 }

@@ -239,11 +246,11 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...
     Array<TData, NSize> result;

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result[i]           = a[i] * b.Get(I);
-    });
+    auto f = mod_conv::multiplies<index_t>{};
+
+    static_for<0, NSize, 1>{}(
+        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(f, a, b, result));

     return result;
 }

@@ -256,11 +263,11 @@ __host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSi...
     Array<TData, NSize> result;

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result[i]           = a.Get(I) - b[i];
-    });
+    auto f = mod_conv::minus<index_t>{};
+
+    static_for<0, NSize, 1>{}(
+        lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(f, a, b, result));

     return result;
 }

@@ -273,10 +280,7 @@ accumulate_on_array(const Array<TData, NSize>& a, Reduce f, TData init)
     static_assert(NSize > 0, "wrong");

-    static_for<0, NSize, 1>{}([&](auto I) {
-        constexpr index_t i = I.Get();
-        result              = f(result, a[i]);
-    });
+    static_for<0, NSize, 1>{}([&](auto I) { result = f(result, a[I]); });

     return result;
 }
src/include/ConstantMergedTensorDescriptor.hip.hpp

@@ -9,6 +9,8 @@
 template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
 struct ConstantMergedTensorDescriptor
 {
+    using Type = ConstantMergedTensorDescriptor;
+
+    static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};
+
     static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs);

@@ -74,43 +76,17 @@ struct ConstantMergedTensorDescriptor
         return OriginalTensorDesc::GetElementSize();
     }

-#if 0
-    __host__ __device__ static constexpr auto
-    GetOriginalMultiIndexFromMultiIndex(Array<index_t, nDim> multi_id)
-    {
-        Array<index_t, nOriginalDim> original_multi_id;
-
-        static_for<0, nDim, 1>{}([&](auto IDim) {
-            constexpr index_t idim = IDim.Get();
-
-            constexpr auto original_dims_partial = std::get<idim>(mOriginalDimMergeSeqs);
-
-            // get partial original-multi-id corresponding to this merged dimension
-            const auto original_multi_id_partial =
-                OriginalTensorDesc::Extract(original_dims_partial)
-                    .GetMultiIndexFrom1dIndex(multi_id[idim]);
-
-            static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) {
-                constexpr auto I = decltype(I_){};
-
-                constexpr index_t idim_original = original_dims_partial.Get(I);
-
-                original_multi_id[idim_original] = original_multi_id_partial[I.Get()];
-            });
-        });
-
-        return original_multi_id;
-    }
-#else
     template <class OriginalDimsPartial>
-    struct GetOriginalMultiIndexFromMultiIndex_impl1
+    struct lambda_1_GetOriginalMultiIndexFromMultiIndex
     {
-        const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial_ref;
-        Array<index_t, nOriginalDim>& original_multi_id_ref;
-
-        __host__ __device__ constexpr GetOriginalMultiIndexFromMultiIndex_impl1(
-            const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial,
-            Array<index_t, nOriginalDim>& original_multi_id)
-            : original_multi_id_partial_ref(original_multi_id_partial),
-              original_multi_id_ref(original_multi_id)
+        const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial;
+        Array<index_t, nOriginalDim>& original_multi_id;
+
+        __host__ __device__ constexpr lambda_1_GetOriginalMultiIndexFromMultiIndex(
+            const Array<index_t, OriginalDimsPartial::GetSize()>& original_multi_id_partial_,
+            Array<index_t, nOriginalDim>& original_multi_id_)
+            : original_multi_id_partial(original_multi_id_partial_),
+              original_multi_id(original_multi_id_)
         {
         }

@@ -119,37 +95,36 @@ struct ConstantMergedTensorDescriptor
         {
             constexpr index_t idim_original = OriginalDimsPartial::Get(Number<I>{});

-            index_t itmp = original_multi_id_partial_ref.Get(Number<I>{});
+            index_t itmp = original_multi_id_partial[I];

-            original_multi_id_ref.Set(Number<idim_original>{}, itmp);
+            original_multi_id.Set(Number<idim_original>{}, itmp);
         }
     };

-    struct GetOriginalMultiIndexFromMultiIndex_impl0
+    struct lambda_0_GetOriginalMultiIndexFromMultiIndex
     {
-        const Array<index_t, nDim>& multi_id_ref;
-        Array<index_t, nOriginalDim>& original_multi_id_ref;
-
-        __host__ __device__ constexpr GetOriginalMultiIndexFromMultiIndex_impl0(
-            const Array<index_t, nDim>& multi_id, Array<index_t, nOriginalDim>& original_multi_id)
-            : multi_id_ref(multi_id), original_multi_id_ref(original_multi_id)
+        const Array<index_t, nDim>& multi_id;
+        Array<index_t, nOriginalDim>& original_multi_id;
+
+        __host__ __device__ constexpr lambda_0_GetOriginalMultiIndexFromMultiIndex(
+            const Array<index_t, nDim>& multi_id_,
+            Array<index_t, nOriginalDim>& original_multi_id_)
+            : multi_id(multi_id_), original_multi_id(original_multi_id_)
         {
         }

         template <index_t IDim>
         __host__ __device__ constexpr void operator()(Number<IDim>) const
         {
-            constexpr auto original_dims_partial =
-                std::get<IDim>(std::tuple<OriginalDimMergeSeqs...>{});
+            constexpr auto original_dims_partial = std::get<IDim>(Type::mOriginalDimMergeSeqs);

             // get partial original-multi-id corresponding to this merged dimension
             const auto original_multi_id_partial =
                 OriginalTensorDesc::Extract(original_dims_partial)
-                    .GetMultiIndexFrom1dIndex(multi_id_ref[IDim]);
+                    .GetMultiIndexFrom1dIndex(multi_id[IDim]);

             static_for<0, original_dims_partial.GetSize(), 1>{}(
-                GetOriginalMultiIndexFromMultiIndex_impl1<decltype(original_dims_partial)>(
-                    original_multi_id_partial, original_multi_id_ref));
+                lambda_1_GetOriginalMultiIndexFromMultiIndex<decltype(original_dims_partial)>(
+                    original_multi_id_partial, original_multi_id));
         }
     };

@@ -160,7 +135,7 @@ struct ConstantMergedTensorDescriptor
         Array<index_t, nOriginalDim> original_multi_id;

-        static_for<0, nDim, 1>{}(
-            GetOriginalMultiIndexFromMultiIndex_impl0(multi_id, original_multi_id));
+        static_for<0, nDim, 1>{}(
+            lambda_0_GetOriginalMultiIndexFromMultiIndex(multi_id, original_multi_id));

         return original_multi_id;
     }

@@ -174,7 +149,6 @@ struct ConstantMergedTensorDescriptor
         return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
     }
-#endif

     __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Array<index_t, nDim> multi_id)

@@ -192,9 +166,9 @@ struct ConstantMergedTensorDescriptor
     __host__ __device__ static constexpr Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
     {
-        constexpr auto dummy_desc = make_ConstantTensorDescriptor_packed(GetLengths());
+        constexpr auto packed_desc = make_ConstantTensorDescriptor_packed(GetLengths());

-        return dummy_desc.GetMultiIndexFrom1dIndex(id);
+        return packed_desc.GetMultiIndexFrom1dIndex(id);
     }
 };
src/include/ConstantTensorDescriptor.hip.hpp

@@ -48,26 +48,48 @@ struct ConstantTensorDescriptor
     template <index_t I>
     __host__ __device__ static constexpr index_t GetLength(Number<I>)
     {
-        return Lengths{}.Get(Number<I>{});
+        return Lengths::Get(Number<I>{});
     }

     template <index_t I>
     __host__ __device__ static constexpr index_t GetStride(Number<I>)
     {
-        return Strides{}.Get(Number<I>{});
+        return Strides::Get(Number<I>{});
     }

-    __host__ __device__ static constexpr bool AreStridesNonAscending()
+    struct lambda_AreDimensionsContinuous
     {
-        bool flag = true;
+        bool& is_continuous;

-        static_for<0, nDim - 1, 1>{}([&](auto IDim) {
-            constexpr auto IDim_p1 = Number<IDim.Get() + 1>{};
+        __host__ __device__ constexpr lambda_AreDimensionsContinuous(bool& is_continuous_)
+            : is_continuous(is_continuous_)
+        {
+        }

-            flag = flag && (GetLength(IDim) >= GetLength(IDim_p1));
-        });
+        template <index_t IDim_>
+        __host__ __device__ constexpr void operator()(Number<IDim_>) const
+        {
+            constexpr auto IDim    = Number<IDim_>{};
+            constexpr auto IDim_p1 = Number<IDim_ + 1>{};

-        return flag;
+            is_continuous =
+                is_continuous && (GetStride(IDim) >= GetStride(IDim_p1) &&
+                                  GetStride(IDim) == GetStride(IDim_p1) * GetLength(IDim_p1));
+        }
+    };
+
+    __host__ __device__ static constexpr bool AreDimensionsContinuous()
+    {
+        bool is_continuous = true;
+
+        static_for<0, nDim - 1, 1>{}(lambda_AreDimensionsContinuous(is_continuous));
+
+        return is_continuous;
+    }
+
+    __host__ __device__ static constexpr bool IsPackedTensor()
+    {
+        return AreDimensionsContinuous() && GetStride(Number<nDim - 1>{}) == 1;
+    }

     template <class T>
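The new AreDimensionsContinuous (used later by Unfold) accepts a layout only when each dimension's stride equals the next dimension's stride times its length, i.e. there is no padding gap between dimensions. A worked check with hypothetical shapes, not taken from the repo:

// Worked illustration of the AreDimensionsContinuous condition (hypothetical
// shapes): a dimension is "continuous" with the next when its stride equals
// the next stride times the next length, leaving no padding gap.
constexpr index_t lengths[3] = {2, 3, 4};
constexpr index_t packed[3]  = {12, 4, 1}; // 12 == 4*3 and 4 == 1*4 -> continuous
constexpr index_t gapped[3]  = {16, 4, 1}; // 16 != 4*3              -> not continuous

static_assert(packed[0] == packed[1] * lengths[1], "continuous layout");
static_assert(gapped[0] != gapped[1] * lengths[1], "gap after each outer slice");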
@@ -92,40 +114,24 @@ struct ConstantTensorDescriptor
         return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
     }

-#if 0
+    // emulate constexpr lambda
     template <index_t NSize>
-    __host__ __device__ static constexpr index_t
-    GetOffsetFromMultiIndex(Array<index_t, NSize> multi_id)
+    struct lambda_GetOffsetFromMultiIndex
     {
-        static_assert(NSize == nDim, "wrong! Dimension not consistent");
-
-        index_t offset = 0;
-
-        static_for<0, nDim, 1>{}([&](auto IDim) {
-            constexpr index_t idim = IDim.Get();
-            offset += multi_id[idim] * GetStride(IDim);
-        });
-
-        return offset;
-    }
-#else
-    template <index_t NSize>
-    struct GetOffsetFromMultiIndex_impl
-    {
-        Array<index_t, NSize>& multi_id_ref;
-        index_t& offset_ref;
-
-        __host__ __device__ constexpr GetOffsetFromMultiIndex_impl(Array<index_t, NSize>& multi_id,
-                                                                   index_t& offset)
-            : multi_id_ref(multi_id), offset_ref(offset)
+        Array<index_t, NSize>& multi_id;
+        index_t& offset;
+
+        __host__ __device__ constexpr lambda_GetOffsetFromMultiIndex(
+            Array<index_t, NSize>& multi_id_, index_t& offset_)
+            : multi_id(multi_id_), offset(offset_)
         {
         }

-        template <index_t IDim>
-        __host__ __device__ constexpr bool operator()(Number<IDim>) const
+        template <class X>
+        __host__ __device__ constexpr void operator()(X IDim) const
         {
-            offset_ref += multi_id_ref.Get(Number<IDim>{}) * Type::GetStride(Number<IDim>{});
-            return true;
+            offset += multi_id[IDim] * Type::GetStride(IDim);
         }
     };

@@ -137,11 +143,10 @@ struct ConstantTensorDescriptor
         index_t offset = 0;

-        static_for<0, nDim, 1>{}(GetOffsetFromMultiIndex_impl<NSize>(multi_id, offset));
+        static_for<0, nDim, 1>{}(lambda_GetOffsetFromMultiIndex<NSize>(multi_id, offset));

         return offset;
     }
-#endif

     template <class... Is>
     __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Is... is)

@@ -160,47 +165,27 @@ struct ConstantTensorDescriptor
                                      multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
     }

-#if 0
-    __host__ __device__ static constexpr Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
-    {
-        Array<index_t, nDim> multi_id;
-
-        constexpr auto dummy_strides = calculate_tensor_strides_packed(GetLengths());
-
-        // calculate index in each of the dimensions in the order of their dimension
-        static_for<0, nDim - 1, 1>{}([&](auto IDim) {
-            constexpr index_t idim   = IDim.Get();
-            constexpr index_t stride = dummy_strides.Get(Number<idim>{});
-            multi_id[idim]           = id / stride;
-            id -= multi_id[idim] * stride;
-        });
-
-        multi_id[nDim - 1] = id / dummy_strides.Get(Number<nDim - 1>{});
-
-        return multi_id;
-    }
-#else
-    struct GetMultiIndexFrom1dIndex_impl
+    // emulate constexpr lambda
+    template <class PackedStrides>
+    struct lambda_GetMultiIndexFrom1dIndex
     {
-        using DummyStrides = decltype(calculate_tensor_strides_packed(GetLengths()));
-
-        index_t& id_ref;
-        Array<index_t, nDim>& multi_id_ref;
-
-        __host__ __device__ constexpr GetMultiIndexFrom1dIndex_impl(index_t& id,
-                                                                    Array<index_t, nDim>& multi_id)
-            : id_ref(id), multi_id_ref(multi_id)
+        index_t& id;
+        Array<index_t, nDim>& multi_id;
+
+        __host__ __device__ constexpr lambda_GetMultiIndexFrom1dIndex(
+            index_t& id_, Array<index_t, nDim>& multi_id_)
+            : id(id_), multi_id(multi_id_)
         {
         }

-        template <index_t IDim>
-        __host__ __device__ constexpr bool operator()(Number<IDim>) const
+        template <class IDim_>
+        __host__ __device__ constexpr void operator()(IDim_) const
         {
-            constexpr index_t stride = DummyStrides::Get(Number<IDim>{});
-            multi_id_ref.Set(Number<IDim>{}, id_ref / stride);
-            id_ref -= multi_id_ref.Get(Number<IDim>{}) * stride;
-            return true;
+            constexpr auto IDim      = IDim_{};
+            constexpr index_t stride = PackedStrides::Get(IDim);
+            multi_id.Set(IDim, id / stride);
+            id -= multi_id[IDim] * stride;
         }
     };

@@ -208,27 +193,15 @@ struct ConstantTensorDescriptor
     {
         Array<index_t, nDim> multi_id;

-        constexpr auto dummy_strides = calculate_tensor_strides_packed(GetLengths());
+        using PackedStrides = decltype(calculate_tensor_strides_packed(GetLengths()));

         // calculate index in each of the dimensions in the order of their dimension
-        static_for<0, nDim - 1, 1>{}(GetMultiIndexFrom1dIndex_impl(id, multi_id));
+        static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex<PackedStrides>(id, multi_id));

-        index_t itmp = id / dummy_strides.Get(Number<nDim - 1>{});
-
-        multi_id.Set(Number<nDim - 1>{}, itmp);
+        multi_id.Set(Number<nDim - 1>{}, id / PackedStrides::Get(Number<nDim - 1>{}));

         return multi_id;
     }
-#endif

-#if 0
-    // return type is Sequence<...>
-    template <index_t Id>
-    __host__ __device__ static constexpr auto GetMultiIndexFrom1dIndex(Number<Id>)
-    {
-        return inclusive_scan_sequence(f_impl, GetStrides(), Number<Id>{});
-    }
-#endif
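The GetMultiIndexFrom1dIndex rewrite above is ordinary mixed-radix decomposition with packed (row-major) strides. A standalone sketch of the same arithmetic in plain C++, with hypothetical names, not code from this commit:

// Standalone sketch of GetMultiIndexFrom1dIndex's arithmetic: decompose a
// linear index into per-dimension indices using packed (row-major) strides.
#include <cstddef>

void multi_index_from_1d(std::size_t id,
                         const std::size_t lengths[],
                         std::size_t multi_id[],
                         std::size_t ndim)
{
    for(std::size_t d = 0; d < ndim; ++d)
    {
        // packed stride of dimension d: product of lengths[d+1 .. ndim-1]
        std::size_t stride = 1;
        for(std::size_t e = d + 1; e < ndim; ++e)
            stride *= lengths[e];

        multi_id[d] = id / stride;
        id -= multi_id[d] * stride;
    }
}
// e.g. lengths {2, 3, 4}, id 17 -> multi_id {1, 1, 1} (17 = 1*12 + 1*4 + 1)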
     __host__ __device__ static constexpr auto GetOriginalMultiIndexFromMultiIndex(Array<index_t, nDim> multi_id)

@@ -236,9 +209,10 @@ struct ConstantTensorDescriptor
         return multi_id;
     }

-    // This function doesn't do carry check on the highest dimension, for performance reasons.
-    // It is the user's responsibility to make sure the result "new_multi_id" is not out-of-bound
-    // on the highest dimension.
+    // This function doesn't do carry check on the highest dimension for positive stepping (or
+    // borrow check on the lowest dimension for negative stepping), for performance reasons. It is
+    // the user's responsibility to make sure the result "new_multi_id" is not out-of-bound on the
+    // highest dimension for positive stepping (or on the lowest dimension for negative stepping).
     template <bool PositiveDirection>
     __host__ __device__ static Array<index_t, nDim>
     UpdateMultiIndexGivenStepSizeOf1dIndex(Array<index_t, nDim> old_multi_id,

@@ -262,14 +236,14 @@ struct ConstantTensorDescriptor
             if(carry)
             {
-                ++new_multi_id[idim];
+                ++new_multi_id(idim);
             }

             carry = false;

             if(new_multi_id[idim] >= GetLength(IDim))
             {
-                new_multi_id[idim] -= GetLength(IDim);
+                new_multi_id(idim) -= GetLength(IDim);
                 carry = true;
             }
         });
@@ -288,14 +262,14 @@ struct ConstantTensorDescriptor
             if(borrow)
             {
-                --new_multi_id[idim];
+                --new_multi_id(idim);
             }

             borrow = false;

             if(new_multi_id[idim] < GetLength(IDim))
             {
-                new_multi_id[idim] += GetLength(IDim);
+                new_multi_id(idim) += GetLength(IDim);
                 borrow = true;
             }
         });
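The carry and borrow loops above implement a mixed-radix increment/decrement over the multi-index, and the rewritten comment spells out the contract: no carry check on the highest dimension for positive stepping, no borrow check on the lowest for negative stepping. A hypothetical standalone sketch of the positive-direction case, not code from this commit:

// Sketch of the positive-direction update: add the step to the lowest
// dimension, then propagate carries toward dimension 0. Per the contract
// above, dimension 0 itself is never range-checked.
void step_multi_index(std::size_t multi_id[], const std::size_t lengths[],
                      std::size_t ndim, std::size_t step)
{
    multi_id[ndim - 1] += step; // assumes step < lengths[ndim - 1], as a sketch

    bool carry = false;
    for(std::size_t d = ndim; d-- > 1;) // dims ndim-1 .. 1
    {
        if(carry)
            ++multi_id[d];

        carry = false;

        if(multi_id[d] >= lengths[d])
        {
            multi_id[d] -= lengths[d];
            carry = true;
        }
    }

    if(carry)
        ++multi_id[0]; // dimension 0 receives the carry unchecked
}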
@@ -382,15 +356,7 @@ struct ConstantTensorDescriptor
         return ConstantTensorDescriptor<decltype(new_lengths), decltype(new_strides)>{};
     }

-    template <index_t Threashold, index_t Delta>
-    struct f_unfold_impl
-    {
-        __host__ __device__ constexpr index_t operator()(index_t x) const
-        {
-            return x > Threashold ? x - Delta : x;
-        }
-    };
-
     // this function unfolds dimensions [FirstUnfoldDim, ..., LastUnfoldDim] into 1 dimension
     template <index_t FirstUnfoldDim, index_t LastUnfoldDim>
     __host__ __device__ static constexpr auto Unfold(Number<FirstUnfoldDim>, Number<LastUnfoldDim>)
     {

@@ -398,24 +364,6 @@ struct ConstantTensorDescriptor
                       FirstUnfoldDim <= LastUnfoldDim, "wrong! should have FirstUnfoldDim <= LastUnfoldDim!");

-#if 0 // cannot compile: compiler complains about constexpr
-        // dimensions to be unfolded need to be in descending order (w.r.t. strides), and need to be
-        // packed in memory, otherwise, unfolding is invalid
-        static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim_) {
-            constexpr auto IDim    = decltype(IDim_){};
-            constexpr auto IDim_p1 = IDim + Number<1>{};
-
-            // check stride
-            static_assert(
-                GetStride(IDim) >= GetStride(IDim_p1),
-                "wrong! dimensions to be unfolded need to be in descending order w.r.t strides");
-
-            // check if packed
-            static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
-                          "wrong! dimensions to be unfolded need to be packed");
-        });
-#endif
-
         // left and right
         constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{};
         constexpr auto middle =

@@ -423,6 +371,9 @@ struct ConstantTensorDescriptor
         constexpr auto right =
             typename arithmetic_sequence_gen<LastUnfoldDim + 1, GetNumOfDimension(), 1>::SeqType{};

+        // dimensions to be unfolded need to be continuous
+        static_assert(Type::Extract(middle).AreDimensionsContinuous(), "wrong! not unfoldable");
+
         // unfolded length, stride
         constexpr index_t unfold_length = accumulate_on_sequence(
             GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});

@@ -446,16 +397,16 @@ struct ConstantTensorDescriptor
     template <class MapNew2Old>
     __host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old)
     {
-        return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenNew2Old(MapNew2Old{})),
-                                        decltype(Strides{}.ReorderGivenNew2Old(MapNew2Old{}))>{};
+        return ConstantTensorDescriptor<decltype(Lengths::ReorderGivenNew2Old(MapNew2Old{})),
+                                        decltype(Strides::ReorderGivenNew2Old(MapNew2Old{}))>{};
     }

 #if 0 // requires sequence_sort, which is not implemented yet
     template <class MapOld2New>
     __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New)
     {
-        return ConstantTensorDescriptor<decltype(Lengths{}.ReorderGivenOld2New(MapOld2New{})),
-                                        decltype(Strides{}.ReorderGivenOld2New(MapOld2New{}))>{}
+        return ConstantTensorDescriptor<decltype(Lengths::ReorderGivenOld2New(MapOld2New{})),
+                                        decltype(Strides::ReorderGivenOld2New(MapOld2New{}))>{}
     }
 #endif
 };
src/include/Sequence.hip.hpp

@@ -2,6 +2,9 @@
 #include "integral_constant.hip.hpp"
 #include "functional.hip.hpp"

+template <class Seq>
+struct is_valid_sequence_map;
+
 template <index_t... Is>
 struct Sequence
 {

@@ -16,7 +19,23 @@ struct Sequence
     {
         static_assert(I < mSize, "wrong! I too large");

-        // the last dummy element is to prevent the compiler complaining about an empty Sequence
+        // the last dummy element is to prevent the compiler complaining about an empty array, when mSize = 0
         const index_t mData[mSize + 1] = {Is..., 0};
         return mData[I];
     }

+    template <index_t I>
+    __host__ __device__ constexpr index_t operator[](Number<I>) const
+    {
+        static_assert(I < mSize, "wrong! I too large");
+
+        const index_t mData[mSize + 1] = {Is..., 0};
+        return mData[I];
+    }
+
+    // make sure I is constexpr
+    __host__ __device__ constexpr index_t operator[](index_t I) const
+    {
+        const index_t mData[mSize + 1] = {Is..., 0};
+        return mData[I];
+    }

@@ -24,24 +43,24 @@ struct Sequence
     template <index_t... IRs>
     __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/)
     {
 #if 0 // requires sequence_sort, which is not implemented yet
         static_assert(is_same<sequence_sort<Sequence<IRs...>>::SortedSeqType,
                               arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value,
                       "wrong! invalid new2old map");
 #endif

         static_assert(sizeof...(Is) == sizeof...(IRs),
                       "wrong! reorder map should have the same size as the Sequence to be reordered");

+        static_assert(is_valid_sequence_map<Sequence<IRs...>>::value,
+                      "wrong! invalid reorder map");
+
-        return Sequence<Type{}.Get(Number<IRs>{})...>{};
+        return Sequence<Type::Get(Number<IRs>{})...>{};
     }

 #if 0 // requires sequence_sort, which is not implemented yet
     template <class MapOld2New>
     __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New /*old2new*/)
     {
 #if 0
         static_assert(is_same<sequence_sort<MapOld2New>::SortedSeqType,
                               arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value,
                       "wrong! invalid old2new map");
 #endif
         static_assert(sizeof...(Is) == MapOld2New::GetSize(),
                       "wrong! reorder map should have the same size as the Sequence to be reordered");

         static_assert(is_valid_sequence_map<MapOld2New>::value, "wrong! invalid reorder map");

         constexpr auto map_new2old = typename sequence_map_inverse<MapOld2New>::SeqMapType{};
         return ReorderGivenNew2Old(map_new2old);

@@ -87,13 +106,13 @@ struct Sequence
     template <index_t... Ns>
     __host__ __device__ static constexpr auto Extract(Number<Ns>...)
     {
-        return Sequence<Type{}.Get(Number<Ns>{})...>{};
+        return Sequence<Type::Get(Number<Ns>{})...>{};
     }

     template <index_t... Ns>
     __host__ __device__ static constexpr auto Extract(Sequence<Ns...>)
     {
-        return Sequence<Type{}.Get(Number<Ns>{})...>{};
+        return Sequence<Type::Get(Number<Ns>{})...>{};
     }

     template <index_t I, index_t X>

@@ -297,6 +316,7 @@ struct sequence_map_inverse<Sequence<Is...>>
 };
 #endif

+template <class Seq>
 struct is_valid_sequence_map
 {

@@ -322,11 +342,6 @@ __host__ __device__ constexpr auto operator-(Sequence<Xs...> seq_x, Sequence<Ys...
 {
     static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size");

-#if 0
-    static_for<0, seq_x.GetSize(), 1>{}(
-        [&](auto I) { static_assert(seq_x.Get(I) >= seq_y.Get(I), "wrong! going to underflow"); });
-#endif
-
     return Sequence<(Xs - Ys)...>{};
 }

@@ -363,15 +378,6 @@ __host__ __device__ constexpr auto operator+(Sequence<Xs...>, Number<Y>)
 template <index_t... Xs, index_t Y>
 __host__ __device__ constexpr auto operator-(Sequence<Xs...>, Number<Y>)
 {
-#if 0 // TODO: turn it on. Doesn't compile
-    constexpr auto seq_x = Sequence<Xs...>{};
-    static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) {
-        constexpr auto I = decltype(Iter){};
-        static_assert(seq_x.Get(I) >= Y, "wrong! going to underflow");
-    });
-#endif
-
     return Sequence<(Xs - Y)...>{};
 }

@@ -404,13 +410,6 @@ __host__ __device__ constexpr auto operator-(Number<Y>, Sequence<Xs...>)
 {
-    constexpr auto seq_x = Sequence<Xs...>{};
-
-#if 0
-    static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) {
-        constexpr auto I = decltype(Iter){};
-        static_assert(seq_x.Get(I) <= Y, "wrong! going to underflow");
-    });
-#endif
-
     return Sequence<(Y - Xs)...>{};
 }

@@ -482,25 +481,6 @@ __host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce, Number<I...
     return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}, Number<Init>{}).Reverse();
 }

-template <class Seq>
-struct accumulate_on_sequence_impl
-{
-    template <class IDim>
-    __host__ __device__ constexpr index_t operator()(IDim) const
-    {
-        return Seq{}.Get(IDim{});
-    }
-};
-
-template <class Seq, class Reduce, index_t I>
-__host__ __device__ constexpr index_t accumulate_on_sequence(Seq, Reduce, Number<I> /*initial_value*/)
-{
-    constexpr index_t a =
-        static_const_reduce_n<Seq::mSize>{}(accumulate_on_sequence_impl<Seq>{}, Reduce{});
-
-    return Reduce{}(a, I);
-}
-
 template <index_t... Is>
 __host__ __device__ constexpr auto Sequence<Is...>::PopFront()
 {
src/include/base.hip.hpp  (new file, 0 → 100644)

#pragma once

__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }

__device__ index_t get_block_1d_id() { return blockIdx.x; }

template <class T1, class T2>
struct is_same
{
    static constexpr bool value = false;
};

template <class T>
struct is_same<T, T>
{
    static constexpr bool value = true;
};

template <class X, class Y>
__host__ __device__ constexpr bool is_same_type(X, Y)
{
    return is_same<X, Y>::value;
}

namespace mod_conv { // namespace mod_conv

template <class T, T s>
struct scales
{
    __host__ __device__ constexpr T operator()(T a) const { return s * a; }
};

template <class T>
struct plus
{
    __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
};

template <class T>
struct minus
{
    __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
};

template <class T>
struct multiplies
{
    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
};

template <class T>
struct integer_divide_ceiler
{
    __host__ __device__ constexpr T operator()(T a, T b) const
    {
        static_assert(is_same<T, index_t>::value || is_same<T, int>::value, "wrong type");
        return (a + b - 1) / b;
    }
};

template <class T>
__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
{
    static_assert(is_same<T, index_t>::value || is_same<T, int>::value, "wrong type");
    return (a + b - 1) / b;
}

template <class T>
__host__ __device__ constexpr T max(T x, T y)
{
    return x > y ? x : y;
}

template <class T, class... Ts>
__host__ __device__ constexpr T max(T x, Ts... xs)
{
    static_assert(sizeof...(xs) > 0, "not enough arguments");

    auto y = max(xs...);

    static_assert(is_same<decltype(y), T>::value, "not the same type");

    return x > y ? x : y;
}

template <class T>
__host__ __device__ constexpr T min(T x, T y)
{
    return x < y ? x : y;
}

template <class T, class... Ts>
__host__ __device__ constexpr T min(T x, Ts... xs)
{
    static_assert(sizeof...(xs) > 0, "not enough arguments");

    auto y = min(xs...);

    static_assert(is_same<decltype(y), T>::value, "not the same type");

    return x < y ? x : y;
}

// this is wrong
// TODO: implement a correct least common multiple, instead of calling max()
template <class T, class... Ts>
__host__ __device__ constexpr T lcm(T x, Ts... xs)
{
    return max(x, xs...);
}

} // namespace mod_conv
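The TODO above is explicit that mod_conv::lcm is only a placeholder that returns max(). For reference, a correct constexpr least common multiple is short; this is a hedged sketch, not part of the commit:

// Sketch of a correct constexpr lcm (not in this commit; the commit's
// mod_conv::lcm deliberately falls back to max(), as its TODO notes).
__host__ __device__ constexpr index_t gcd_sketch(index_t a, index_t b)
{
    return b == 0 ? a : gcd_sketch(b, a % b);
}

__host__ __device__ constexpr index_t lcm_sketch(index_t a, index_t b)
{
    return (a / gcd_sketch(a, b)) * b; // divide first to reduce overflow risk
}

template <class... Ts>
__host__ __device__ constexpr index_t lcm_sketch(index_t x, index_t y, Ts... zs)
{
    return lcm_sketch(lcm_sketch(x, y), zs...); // fold pairwise over the pack
}

For the power-of-two vector widths these kernels pass in, max() coincides with the true lcm, which is presumably why the placeholder is tolerable in practice.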
src/include/blockwise_generic_tensor_slice_op.hip.hpp

@@ -122,7 +122,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr auto src_partial_original_desc =
                 SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);

-            mThreadSrcPartialOffsets[idim] = src_partial_original_desc.GetOffsetFromMultiIndex(
+            mThreadSrcPartialOffsets(idim) = src_partial_original_desc.GetOffsetFromMultiIndex(
                 extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
         });

@@ -136,7 +136,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             constexpr auto dst_partial_original_desc =
                 DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);

-            mThreadDstPartialOffsets[idim] = dst_partial_original_desc.GetOffsetFromMultiIndex(
+            mThreadDstPartialOffsets(idim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
                 extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
         });

@@ -206,18 +206,16 @@ struct BlockwiseGenericTensorSliceCopy_v1
 #if 0
         constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});

-        const auto src_thread_data_multi_id_begin =
-            repeat_multi_id * data_per_cluster_per_dims; // cannot not constexpr, why?
+        const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-        const auto clipboard_data_multi_id_begin =
-            repeat_multi_id * thread_sub_tensor_lengths; // cannot not constexpr, why?
+        const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;

-        const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(
-            src_thread_data_multi_id_begin); // cannot not constexpr, why?
+        const index_t src_offset =
+            SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);

-        const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
-            clipboard_data_multi_id_begin); // cannot not constexpr, why?
+        const index_t clipboard_offset =
+            thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
 #else
+        // the HIP compiler performs better with this code
         constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};

         constexpr auto src_thread_data_multi_id_begin =

@@ -261,18 +259,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
 #if 0
         constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});

-        const auto clipboard_data_multi_id_begin =
-            repeat_multi_id * thread_sub_tensor_lengths; // cannot not constexpr, why?
+        const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;

-        const auto dst_data_multi_id_begin =
-            repeat_multi_id * data_per_cluster_per_dims; // cannot not constexpr, why?
+        const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-        const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
-            clipboard_data_multi_id_begin); // cannot not constexpr, why?
+        const index_t clipboard_offset =
+            thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);

-        const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
-            dst_data_multi_id_begin); // cannot not constexpr, why?
+        const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
 #else
+        // the HIP compiler performs better with this code
         constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};

         constexpr auto clipboard_data_multi_id_begin =

@@ -343,33 +338,12 @@ struct BlockwiseGenericTensorSliceCopy_v1
             src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
                 old_src_partial_original_multi_id, StepSize, direction);

-#if 0
-        {
-            if(debug_flag && get_block_1d_id() == 0)
-            {
-                printf("id %5u %5u: "
-                       "old_src_partial_original_multi_id %u %u %u, "
-                       "new_src_partial_original_multi_id %u %u %u, "
-                       "mThreadSrcOffset %u, mThreadDstOffset %u \n",
-                       get_block_1d_id(),
-                       get_thread_local_1d_id(),
-                       old_src_partial_original_multi_id[0],
-                       old_src_partial_original_multi_id[1],
-                       old_src_partial_original_multi_id[2],
-                       new_src_partial_original_multi_id[0],
-                       new_src_partial_original_multi_id[1],
-                       new_src_partial_original_multi_id[2]);
-            }
-        }
-#endif
-
         // update "mThreadSrcOriginalMultiId"
         static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I_) {
             constexpr auto I = decltype(I_){};

             constexpr index_t idim_original = src_partial_original_dims.Get(I);

-            mThreadSrcOriginalMultiId[idim_original] = new_src_partial_original_multi_id[I.Get()];
+            mThreadSrcOriginalMultiId(idim_original) = new_src_partial_original_multi_id[I.Get()];
         });

@@ -381,7 +355,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             new_src_partial_original_multi_id);

         // update "mThreadSrcPartialOffsets"
-        mThreadSrcPartialOffsets[idim] = new_src_partial_offset;
+        mThreadSrcPartialOffsets(idim) = new_src_partial_offset;

         // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
         mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset;

@@ -401,15 +375,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
         static_if<PositiveDirection>{}([&](auto fwd) {
             mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim);

-            mThreadSrcOriginalMultiId[idim_original] += StepSize;
+            mThreadSrcOriginalMultiId(idim_original) += StepSize;

-            mThreadSrcPartialOffsets[idim] += StepSize * fwd(SrcDesc{}).GetStride(IDim);
+            mThreadSrcPartialOffsets(idim) += StepSize * fwd(SrcDesc{}).GetStride(IDim);
         }).Else([&](auto fwd) {
             mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim);

-            mThreadSrcOriginalMultiId[idim_original] -= StepSize;
+            mThreadSrcOriginalMultiId(idim_original) -= StepSize;

-            mThreadSrcPartialOffsets[idim] -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
+            mThreadSrcPartialOffsets(idim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim);
         });
     });
 }
src/include/common.hip.hpp

 #pragma once

+#include "base.hip.hpp"
 #include "vector_type.hip.hpp"
 #include "integral_constant.hip.hpp"
 #include "Sequence.hip.hpp"

@@ -10,109 +11,3 @@
 #if USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hip.hpp"
 #endif
-
-__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
-
-__device__ index_t get_block_1d_id() { return blockIdx.x; }
-
-template <class T1, class T2>
-struct is_same
-{
-    static constexpr bool value = false;
-};
-
... (the remaining removed lines are the is_same<T, T> specialization, is_same_type, and the mod_conv helpers scales, plus, multiplies, integer_divide_ceiler, integer_divide_ceil, max, min, and least_common_multiple with its "this is wrong" TODO, all moved verbatim into the new base.hip.hpp above, where least_common_multiple is renamed lcm and a mod_conv::minus functor is added)
src/include/functional.hip.hpp

@@ -19,18 +19,7 @@ struct swallow
     }
 };

-#if 0
-template <class F>
-__host__ __device__ constexpr auto unpacker(F f)
-{
-    return [=](auto xs_array) { f(xs...); };
-}
-#endif
-
-// Emulate compile time if statement for C++14
-// Get the idea from
-// "https://baptiste-wicht.com/posts/2015/07/simulate-static_if-with-c11c14.html"
-// TODO: use if constexpr, when C++17 is supported
+// Emulate if constexpr
 template <bool Predicate>
 struct static_if
 {

@@ -81,28 +70,3 @@ struct static_if<false>
         return Type{};
     }
 };
-
-template <index_t NLoop>
-struct static_const_reduce_n
-{
-    // signature of F: F(Number<I>)
-    template <class F, class Reduce>
-    __host__ __device__ constexpr auto operator()(F f, Reduce r) const
-    {
-        static_assert(NLoop > 1, "out-of-range");
-
-        constexpr auto a = f(Number<NLoop - 1>{});
-        auto b = static_const_reduce_n<NLoop - 1>{}(f, r); // TODO: cannot use constexpr here, weird
-        return r(a, b);
-    }
-};
-
-template <>
-struct static_const_reduce_n<1>
-{
-    template <class F, class Reduce>
-    __host__ __device__ constexpr auto operator()(F f, Reduce) const
-    {
-        return f(Number<0>{});
-    }
-};
src/include/functional2.hip.hpp

@@ -2,29 +2,16 @@
 #include "functional.hip.hpp"
 #include "Sequence.hip.hpp"

-#if 0
-template <index_t Iter, index_t Remaining, index_t Increment>
-struct static_for_impl
-{
-    template <class F>
-    constexpr __host__ __device__ void operator()(F f) const
-    {
-        static_assert(Remaining % Increment == 0, "wrong! Remaining % Increment != 0");
-        static_assert(Increment <= Remaining, "will go out-of-range");
-
-        f(Number<Iter>{});
-
-        static_for_impl<Iter + Increment, Remaining - Increment, Increment>{}(f);
-    }
-};
-
-template <index_t Iter, index_t Increment>
-struct static_for_impl<Iter, 0, Increment>
+template <class>
+struct static_for_impl;
+
+template <index_t... Is>
+struct static_for_impl<Sequence<Is...>>
 {
     template <class F>
-    constexpr __host__ __device__ void operator()(F) const
+    __host__ __device__ constexpr void operator()(F f) const
     {
-        // no work left, just return
-        return;
+        swallow{(f(Number<Is>{}), 0)...};
     }
 };
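The new static_for_impl replaces recursion with a single pack expansion: each element of the braced list is the comma expression (f(Number<Is>{}), 0), which calls f and yields a throwaway 0, and constructing swallow forces every element to be evaluated in order. A minimal self-contained sketch of the trick with hypothetical names, not code from this commit:

// Minimal sketch of the swallow/pack-expansion trick used by static_for_impl.
// Building the aggregate forces each (f(Is), 0) element to be evaluated,
// left to right, with no template recursion.
#include <cstdio>

struct swallow_sketch
{
    template <class... Ts>
    swallow_sketch(Ts&&...) {} // accepts and discards any number of values
};

template <int... Is>
void call_for_each()
{
    swallow_sketch{(std::printf("%d ", Is), 0)...}; // prints each Is in order
}

int main()
{
    call_for_each<0, 1, 2, 3>(); // prints "0 1 2 3 "
}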
@@ -33,48 +20,42 @@ template <index_t NBegin, index_t NEnd, index_t Increment>
 // F signature: F(Number<Iter>)
 template <index_t NBegin, index_t NEnd, index_t Increment>
 struct static_for
 {
     template <class F>
-    constexpr __host__ __device__ void operator()(F f) const
+    __host__ __device__ constexpr void operator()(F f) const
     {
         static_assert(NBegin <= NEnd, "wrong! should have NBegin <= NEnd");
         static_assert((NEnd - NBegin) % Increment == 0,
                       "wrong! should satisfy (NEnd - NBegin) % Increment == 0");

-#if 0
-        static_if<(NBegin < NEnd)>{}(
-            [&](auto fwd) { static_for_impl<NBegin, NEnd - NBegin, fwd(Increment)>{}(f); });
-#else
-        static_for_impl<NBegin, NEnd - NBegin, Increment>{}(f);
-#endif
+        static_for_impl<typename arithmetic_sequence_gen<NBegin, NEnd, Increment>::SeqType>{}(f);
     }
 };

+template <class Seq, class Reduce>
+struct lambda_accumulate_on_sequence
+{
+    const Reduce& f;
+    index_t& result;
+
+    __host__ __device__ constexpr lambda_accumulate_on_sequence(const Reduce& f_, index_t& result_)
+        : f(f_), result(result_)
+    {
+    }
+
+    template <class IDim>
+    __host__ __device__ constexpr index_t operator()(IDim) const
+    {
+        return result = f(result, Seq::Get(IDim{}));
+    }
+};
+
+template <class Seq, class Reduce, index_t Init>
+__host__ __device__ constexpr index_t accumulate_on_sequence(Seq, Reduce f, Number<Init> /*initial_value*/)
+{
+    index_t result = Init;
+
+    static_for<0, Seq::mSize, 1>{}(lambda_accumulate_on_sequence<Seq, Reduce>(f, result));
+
+    return result;
+}
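A hedged usage sketch of the relocated accumulate_on_sequence, assuming the Sequence and mod_conv helpers above; the Unfold code in ConstantTensorDescriptor.hip.hpp uses this same shape-product idiom for unfold_length:

// Hypothetical usage: fold a compile-time Sequence with a binary functor.
// static_for walks Sequence<2, 3, 4> and accumulates 1 * 2 * 3 * 4 = 24.
__host__ __device__ void accumulate_example()
{
    constexpr auto lengths = Sequence<2, 3, 4>{};

    const index_t element_count =
        accumulate_on_sequence(lengths, mod_conv::multiplies<index_t>{}, Number<1>{});
    // element_count == 24
}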
src/include/functional3.hip.hpp

@@ -11,7 +11,7 @@ struct static_ford_impl
     // F signature: F(Sequence<...> multi_id)
     // CurrentMultiIndex: Sequence<...>
     template <class F, class CurrentMultiIndex>
-    __host__ __device__ void operator()(F f, CurrentMultiIndex) const
+    __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const
     {
         static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here");

@@ -28,7 +28,7 @@ struct static_ford_impl<Sequence<>>
     // F signature: F(Sequence<...> multi_id)
     // CurrentMultiIndex: Sequence<...>
     template <class F, class CurrentMultiIndex>
-    __host__ __device__ void operator()(F f, CurrentMultiIndex) const
+    __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const
     {
         f(CurrentMultiIndex{});
     }

@@ -40,7 +40,7 @@ struct static_ford
 {
     // F signature: F(Sequence<...> multi_id)
     template <class F>
-    __host__ __device__ void operator()(F f) const
+    __host__ __device__ constexpr void operator()(F f) const
     {
         static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty");

@@ -55,7 +55,7 @@ struct ford_impl
     // CurrentMultiIndex: Array<...>
     // RemainLengths: Sequence<...>
     template <class F, class CurrentMultiIndex, class RemainLengths>
-    __host__ __device__ void
+    __host__ __device__ constexpr void
     operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const
     {
         static_assert(RemainLengths::GetSize() == RemainDim, "wrong!");

@@ -77,7 +77,7 @@ struct ford_impl<1>
     // CurrentMultiIndex: Array<...>
     // RemainLengths: Sequence<...>
     template <class F, class CurrentMultiIndex, class RemainLengths>
-    __host__ __device__ void
+    __host__ __device__ constexpr void
     operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const
     {
         static_assert(RemainLengths::GetSize() == 1, "wrong!");

@@ -97,7 +97,7 @@ struct ford
 {
     // F signature: F(Array<...> multi_id)
     template <class F>
-    __host__ __device__ void operator()(F f) const
+    __host__ __device__ constexpr void operator()(F f) const
     {
         constexpr index_t first_length = Lengths{}.Front();
src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp

@@ -103,7 +103,7 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
         // tensor view of blockwise input and weight in LDS
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);

@@ -119,11 +119,11 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
         constexpr auto wei_cyx_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock * Y * X, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

         constexpr auto wei_c_y_x_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, Y, X, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(
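Switching max_align from mod_conv::max to mod_conv::lcm matches what alignment actually requires: an LDS buffer read with vector width A by one copy path and width B by another must be aligned to a multiple of both, i.e. lcm(A, B), and max(A, B) only coincides when one width divides the other. A small illustration with hypothetical widths, not values from the kernels:

// Why lcm, not max: an alignment satisfying reads of width 4 and width 6
// must be a multiple of both widths. Values here are illustrative only.
constexpr index_t a = 4, b = 6;
// mod_conv::max(a, b) == 6  -> offset 6 is not a multiple of 4: misaligned reads
// true lcm(a, b)      == 12 -> every multiple of 12 is a multiple of 4 and of 6
// For power-of-two widths, max == lcm, so the current lcm placeholder (which
// returns max) still yields correct alignment in these kernels.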
src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp

@@ -104,7 +104,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);

@@ -120,7 +120,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
         constexpr auto wei_c_x_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, X, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(

src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp

@@ -108,7 +108,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);

src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp

@@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);

@@ -115,7 +115,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(

src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp

@@ -104,7 +104,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);

@@ -120,7 +120,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(

src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp

@@ -106,7 +106,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);

@@ -122,7 +122,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(

src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp

@@ -105,7 +105,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);

@@ -121,7 +121,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(

src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp

@@ -104,7 +104,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);

@@ -120,7 +120,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor(

src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp

@@ -103,7 +103,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
         // LDS tensor view
         // be careful of alignment
-        constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N,
+        constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N,
                                                     WeiBlockCopyDataPerRead_K,
                                                     GemmDataPerReadA,
                                                     GemmDataPerReadB);

@@ -119,7 +119,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
         constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
             Sequence<CPerBlock, KPerBlock>{},
-            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});
+            Number<mod_conv::lcm(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

         // tensor view of threadwise output in register
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(

src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp

@@ -181,7 +181,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
         // LDS: be careful of alignment
         constexpr index_t max_align =
-            mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);
+            mod_conv::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead);

         constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number<max_align>{});