gaoqiong / composable_kernel · Commits · 39821a90

Commit 39821a90, authored Jan 29, 2021 by Chao Liu

    refactor, clean up

Parent: 01e94729

Showing 8 changed files with 342 additions and 579 deletions.
Files changed:

  composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp                    +17  -14
  composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp                +30  -92
  composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp    +13  -13
  composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp                      +75  -14
  composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp   +188 -430
  driver/include/device_dummy_dynamic_transform.hpp                                         +15  -12
  driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp   +1   -1
  driver/src/conv_driver.cpp                                                                +3   -3
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp

@@ -232,13 +232,13 @@ struct DummyDynamicTransform_1
     auto in_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, idx);

-    const auto in_gemmk_gemmn_coord_step =
-        make_dynamic_tensor_coordinate_step(in_gemmk_gemmn_global_desc, make_multi_index(1, 0));
+    const auto in_gemmk_gemmn_coord_iterator = make_dynamic_tensor_coordinate_iterator(
+        in_gemmk_gemmn_global_desc, make_multi_index(1, 0));

     for(index_t iter = 0; iter < niter; ++iter)
     {
-        move_dynamic_tensor_coordinate(
-            in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_step);
+        move_dynamic_tensor_coordinate(
+            in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_iterator);

         // write
         float value = 1;
...
@@ -352,12 +352,12 @@ struct DummyDynamicTransform_1
     auto in_coord = make_dynamic_tensor_coordinate(in_n_c_hip_wip_global_desc, idx);

-    const auto in_coord_step = make_dynamic_tensor_coordinate_step(
-        in_n_c_hip_wip_global_desc, make_multi_index(1, 0, 0, 0));
+    const auto in_coord_iterator = make_dynamic_tensor_coordinate_iterator(
+        in_n_c_hip_wip_global_desc, make_multi_index(1, 0, 0, 0));

     for(index_t iter = 0; iter < niter; ++iter)
     {
-        move_dynamic_tensor_coordinate(in_n_c_hip_wip_global_desc, in_coord, in_coord_step);
+        move_dynamic_tensor_coordinate(in_n_c_hip_wip_global_desc, in_coord, in_coord_iterator);

         // write
         float value = 1;
...
@@ -430,21 +430,24 @@ struct DummyDynamicTransform_fwd_v4r4
     auto in_gemmk_gemmn_gemmkpack_coord =
         make_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc, idx);

-    const auto in_gemmk_gemmn_gemmkpack_coord_step_0_0_1 = make_dynamic_tensor_coordinate_step(
-        in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 1));
+    const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1 =
+        make_dynamic_tensor_coordinate_iterator(
+            in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 1));

-    const auto in_gemmk_gemmn_gemmkpack_coord_step_0_1_0 = make_dynamic_tensor_coordinate_step(
-        in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 1, 0));
+    const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0 =
+        make_dynamic_tensor_coordinate_iterator(
+            in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 1, 0));

-    const auto in_gemmk_gemmn_gemmkpack_coord_step_1_0_0 = make_dynamic_tensor_coordinate_step(
-        in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(1, 0, 0));
+    const auto in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0 =
+        make_dynamic_tensor_coordinate_iterator(
+            in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(1, 0, 0));

     // move (0, 0, 1)
     for(index_t iter = 0; iter < niter; ++iter)
     {
         move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
                                        in_gemmk_gemmn_gemmkpack_coord,
-                                       in_gemmk_gemmn_gemmkpack_coord_step_0_0_1);
+                                       in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1);

         // write
         float value = 1;
...
@@ -476,7 +479,7 @@ struct DummyDynamicTransform_fwd_v4r4
     {
         move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
                                        in_gemmk_gemmn_gemmkpack_coord,
-                                       in_gemmk_gemmn_gemmkpack_coord_step_0_1_0);
+                                       in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0);

         // write
         float value = 1;
...
@@ -508,7 +511,7 @@ struct DummyDynamicTransform_fwd_v4r4
     {
         move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
                                        in_gemmk_gemmn_gemmkpack_coord,
-                                       in_gemmk_gemmn_gemmkpack_coord_step_1_0_0);
+                                       in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0);

         // write
         float value = 1;
...
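Note (not part of the commit): the hunks above all converge on the same renamed pattern — build a coordinate once, build a coordinate "iterator" for a fixed index difference, then advance the coordinate with it in a loop. A minimal sketch, assuming a dynamic tensor descriptor desc and a trip count niter defined elsewhere (both hypothetical names):

    // make a coordinate at the origin of a 2D dynamic tensor
    auto coord = make_dynamic_tensor_coordinate(desc, make_multi_index(0, 0));

    // an iterator records, for a fixed index diff (+1, 0), which transforms
    // of the descriptor need to be re-run when the coordinate moves by it
    const auto coord_iterator =
        make_dynamic_tensor_coordinate_iterator(desc, make_multi_index(1, 0));

    for(index_t iter = 0; iter < niter; ++iter)
    {
        // advance coord by the precomputed difference
        move_dynamic_tensor_coordinate(desc, coord, coord_iterator);
    }
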
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp

@@ -9,12 +9,8 @@ namespace ck {
 template <index_t NDimHidden, typename VisibleDimensionIds>
 struct DynamicTensorCoordinate;

-#if 0 // hack
-template <index_t NTransform, index_t NDimVisible>
-struct DynamicTensorCoordinateIterator;
-#else
-template <index_t NTransform, index_t NDimVisible, typename HackCalculateLowerIndexDiff>
-struct DynamicTensorCoordinateStep;
-#endif
+template <index_t NTransform, index_t NDimVisible, typename UpdateLowerIndexHack>
+struct DynamicTensorCoordinateIterator;

 // Transforms: Tuple<transforms...>
 // LowerDimensionIdss : Tuple<Sequence<...>, ...>
...
@@ -193,18 +189,14 @@ struct DynamicTensorCoordinate
     HiddenIndex idx_hidden_;
 };

-#if 0 // hack
-template <index_t NTransform, index_t NDimVisible>
-struct DynamicTensorCoordinateIterator
-#else
-template <index_t NTransform, index_t NDimVisible, typename HackCalculateLowerIndexDiff>
-struct DynamicTensorCoordinateStep
-#endif
+template <index_t NTransform, index_t NDimVisible, typename UpdateLowerIndexHack>
+struct DynamicTensorCoordinateIterator
 {
     // TODO make these private
     using VisibleIndex = MultiIndex<NDimVisible>;

     public:
-    __host__ __device__ explicit constexpr DynamicTensorCoordinateStep(
+    __host__ __device__ explicit constexpr DynamicTensorCoordinateIterator(
         const VisibleIndex& idx_diff_visible, const MultiIndex<NTransform>& do_transforms)
         : idx_diff_visible_{idx_diff_visible}, do_transforms_{do_transforms}
     {
...
@@ -221,10 +213,8 @@ struct DynamicTensorCoordinateStep
     const VisibleIndex idx_diff_visible_;
     const MultiIndex<NTransform> do_transforms_;

-#if 1 // hack
-    // HACK: control CalculateLowerIndexDiff for DynamicMerge using ing hack
-    static constexpr HackCalculateLowerIndexDiff hack_calculate_lower_index_diff_;
-#endif
+    // HACK: control UpdateLowerIndex()
+    static constexpr UpdateLowerIndexHack update_lower_index_hack_;
 };

 // TODO: How to fix this? It uses an struct instead of lambda because lambda
...
@@ -350,9 +340,11 @@ __host__ __device__ constexpr auto make_dynamic_tensor_coordinate(const TensorDe
     return DynamicTensorCoordinate<ndim_hidden, decltype(visible_dim_ids)>{idx_hidden};
 }

-template <typename TensorDesc, typename VisibleIndex>
+// UpdateLowerIndexHack: Sequence<...>
+// HACK: control UpdateLowerIndex
+template <typename TensorDesc, typename VisibleIndex, typename UpdateLowerIndexHack>
 __host__ __device__ constexpr auto
-make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_diff_visible)
+make_dynamic_tensor_coordinate_iterator(
+    const TensorDesc&, const VisibleIndex& idx_diff_visible, UpdateLowerIndexHack)
 {
     static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(),
                   "wrong! # of dimension inconsistent");
...
@@ -362,6 +354,8 @@ make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_d
     constexpr index_t ndim_visible = TensorDesc::GetNumOfVisibleDimension();
     constexpr auto visible_dim_ids = TensorDesc::GetVisibleDimensionIds();

+    static_assert(UpdateLowerIndexHack::Size() == ntransform, "wrong!");
+
     // use index_t for boolean type
     auto do_transforms    = make_zero_multi_index<ntransform>();
     auto is_non_zero_diff = make_zero_multi_index<ndim_hidden>();
...
@@ -397,78 +391,23 @@ make_dynamic_tensor_coordinate_step(const TensorDesc&, const VisibleIndex& idx_d
         set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low);
     });

-#if 0 // hack
-    return DynamicTensorCoordinateStep<ntransform, ndim_visible>{idx_diff_visible, do_transforms};
-#else
-    return DynamicTensorCoordinateStep<ntransform,
-                                       ndim_visible,
-                                       typename uniform_sequence_gen<ntransform, 0>::type>{
-        idx_diff_visible, do_transforms};
-#endif
+    return DynamicTensorCoordinateIterator<ntransform, ndim_visible, UpdateLowerIndexHack>{
+        idx_diff_visible, do_transforms};
 }

-#if 0 // hack
-template <typename TensorDesc, typename VisibleIndex>
-#else
-template <typename TensorDesc, typename VisibleIndex, typename HackCalculateLowerIndexDiff>
-#endif
-__host__ __device__ constexpr auto
-// HACK: control CalculateLowerIndexDiff for DynamicMerge using ing hack
-make_dynamic_tensor_coordinate_step_hack(
-    const TensorDesc&, const VisibleIndex& idx_diff_visible, HackCalculateLowerIndexDiff)
-{
-    static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(),
-                  "wrong! # of dimension inconsistent");
-
-    constexpr index_t ntransform   = TensorDesc::GetNumOfTransform();
-    constexpr index_t ndim_hidden  = TensorDesc::GetNumOfHiddenDimension();
-    constexpr index_t ndim_visible = TensorDesc::GetNumOfVisibleDimension();
-    constexpr auto visible_dim_ids = TensorDesc::GetVisibleDimensionIds();
-
-    static_assert(HackCalculateLowerIndexDiff::Size() == ntransform, "wrong!");
-
-    // use index_t for boolean type
-    auto do_transforms    = make_zero_multi_index<ntransform>();
-    auto is_non_zero_diff = make_zero_multi_index<ndim_hidden>();
-
-    // decide do_transform by checkout non-zero index diff components
-    MultiIndex<VisibleIndex::Size()> non_zero_diff_pick_visible;
-
-    static_for<0, ndim_visible, 1>{}(
-        [&](auto i) { non_zero_diff_pick_visible(i) = (idx_diff_visible[i] != 0); });
-
-    set_container_subset(is_non_zero_diff, visible_dim_ids, non_zero_diff_pick_visible);
-
-    static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
-        constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
-        constexpr auto dims_up  = TensorDesc::GetUpperDimensionIdss().At(itran);
-
-        const auto non_zero_diff_pick_up = get_container_subset(is_non_zero_diff, dims_up);
-
-        MultiIndex<dims_low.Size()> non_zero_diff_pick_low;
-
-        // if any of upper index diff components is non-zero, then
-        // 1) Need to do this transform
-        // 2) all components of lower index diff will assume to be non-zero and need to be
-        // computed
-        const bool idx_diff_up_has_non_zero = container_reduce(
-            non_zero_diff_pick_up, [](auto a, auto b) constexpr { return a or b; }, false);
-
-        do_transforms(itran) = idx_diff_up_has_non_zero;
-
-        static_for<0, dims_low.Size(), 1>{}(
-            [&](auto i) { non_zero_diff_pick_low(i) = idx_diff_up_has_non_zero; });
-
-        set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low);
-    });
-
-    return DynamicTensorCoordinateStep<ntransform, ndim_visible, HackCalculateLowerIndexDiff>{
-        idx_diff_visible, do_transforms};
-}
+template <typename TensorDesc, typename VisibleIndex>
+__host__ __device__ constexpr auto
+make_dynamic_tensor_coordinate_iterator(const TensorDesc&, const VisibleIndex& idx_diff_visible)
+{
+    constexpr index_t ntransform = TensorDesc::GetNumOfTransform();
+
+    return make_dynamic_tensor_coordinate_iterator(
+        TensorDesc{}, idx_diff_visible, typename uniform_sequence_gen<ntransform, 0>::type{});
+}

-template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
-__host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDesc& tensor_desc,
-                                                                   TensorCoord& coord,
-                                                                   const TensorCoordStep& coord_step)
+template <typename TensorDesc, typename TensorCoord, typename TensorCoordIterator>
+__host__ __device__ constexpr void move_dynamic_tensor_coordinate(
+    const TensorDesc& tensor_desc, TensorCoord& coord, const TensorCoordIterator& coord_iterator)
 {
     constexpr index_t ndim_hidden  = TensorDesc::GetNumOfHiddenDimension();
     constexpr index_t ndim_visible = TensorDesc::GetNumOfVisibleDimension();
...
@@ -480,8 +419,9 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
     auto idx_diff_hidden = make_zero_multi_index<ndim_hidden>();

     // initialize visible index diff
-    set_container_subset(
-        idx_diff_hidden, TensorDesc::GetVisibleDimensionIds(), coord_step.GetVisibleIndexDiff());
+    set_container_subset(idx_diff_hidden,
+                         TensorDesc::GetVisibleDimensionIds(),
+                         coord_iterator.GetVisibleIndexDiff());

     // this is what needs to be updated
     auto& idx_hidden = coord.GetHiddenIndex();
...
@@ -490,13 +430,13 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
     auto idx_hidden_pick_visible =
         get_container_subset(idx_hidden, TensorDesc::GetVisibleDimensionIds());

-    idx_hidden_pick_visible += coord_step.GetIndexDiff();
+    idx_hidden_pick_visible += coord_iterator.GetIndexDiff();

     set_container_subset(idx_hidden, TensorDesc::GetVisibleDimensionIds(), idx_hidden_pick_visible);

     // update rest of hidden index
     static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
-        if(coord_step.do_transforms_[itran])
+        if(coord_iterator.do_transforms_[itran])
         {
             const auto& tran        = tensor_desc.GetTransforms().At(itran);
             constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran);
...
@@ -509,9 +449,7 @@ __host__ __device__ constexpr void move_dynamic_tensor_coordinate(const TensorDe
             MultiIndex<dims_low.Size()> idx_diff_low;

             // HACK: control UpdateLowerIndex for DynamicMerge using hack
-            constexpr index_t Hack =
-                decltype(coord_step.hack_calculate_lower_index_diff_)::At(itran);
+            // TODO remove hack
+            constexpr index_t Hack = decltype(coord_iterator.update_lower_index_hack_)::At(itran);

             tran.UpdateLowerIndex(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
...
@@ -579,7 +517,7 @@ using DynamicTensorCoordinate_t = decltype(make_dynamic_tensor_coordinate(
     TensorDesc{}, MultiIndex<remove_cv_t<remove_reference_t<TensorDesc>>::GetNumOfDimension()>{}));

 template <typename TensorDesc>
-using DynamicTensorCoordinateStep_t = decltype(make_dynamic_tensor_coordinate_step(
+using DynamicTensorCoordinateIterator_t = decltype(make_dynamic_tensor_coordinate_iterator(
     TensorDesc{}, MultiIndex<remove_cv_t<remove_reference_t<TensorDesc>>::GetNumOfDimension()>{}));

 } // namespace ck
...
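Note (not part of the commit): after this change make_dynamic_tensor_coordinate_iterator has two overloads — one taking an explicit UpdateLowerIndexHack Sequence (one entry per transform, enforced by the new static_assert) and a convenience overload that forwards an all-zero Sequence. A small sketch, assuming a descriptor desc of type Desc (hypothetical names):

    constexpr index_t ntransform = Desc::GetNumOfTransform();

    // explicit hack: one entry per transform controls UpdateLowerIndex()
    const auto it_hack = make_dynamic_tensor_coordinate_iterator(
        desc, make_multi_index(0, 1), typename uniform_sequence_gen<ntransform, 0>::type{});

    // convenience overload: equivalent to passing the all-zero Sequence above
    const auto it_default = make_dynamic_tensor_coordinate_iterator(desc, make_multi_index(0, 1));
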
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp

@@ -87,21 +87,15 @@ struct BlockwiseDynamicTensorSliceTransfer_v4
         return thread_cluster_id * ThreadSliceLengths{};
     }

-    __device__ void RunRead(const SrcDesc& src_desc, const SrcData* p_src)
-    {
-        if(BlockSize == thread_cluster_desc_.GetElementSize() or
-           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
-        {
-            threadwise_transfer_.RunRead(src_desc, p_src);
-        }
-    }
-
-    __device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
+    template <typename SrcIteratorHacks>
+    __device__ void RunRead(const SrcDesc& src_desc,
+                            const SrcData* p_src,
+                            const SrcIteratorHacks& src_iterator_hacks)
     {
         if(BlockSize == thread_cluster_desc_.GetElementSize() or
            get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
         {
-            threadwise_transfer_.RunRead_hack(src_desc, p_src);
+            threadwise_transfer_.RunRead(src_desc, p_src, src_iterator_hacks);
         }
     }
...
@@ -123,12 +117,18 @@ struct BlockwiseDynamicTensorSliceTransfer_v4
         }
     }

-    __device__ void MoveSrcSliceWindow_hack(const SrcDesc& src_desc, const Index& step)
+    // SrcMoveSliceWindowIteratorHack to control index calculation move slice window
+    template <typename SrcMoveSliceWindowIteratorHack>
+    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
+                                       const Index& step,
+                                       const SrcMoveSliceWindowIteratorHack&
+                                           src_move_slice_window_iterator_hack)
     {
         if(BlockSize == thread_cluster_desc_.GetElementSize() or
           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
         {
-            threadwise_transfer_.MoveSrcSliceWindow_hack(src_desc, step);
+            threadwise_transfer_.MoveSrcSliceWindow(
+                src_desc, step, src_move_slice_window_iterator_hack);
         }
     }
...
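Note (not part of the commit): a sketch of the caller side of the two blockwise member functions changed above, mirroring how gridwise_dynamic_gemm.hpp (next file) drives them; blockwise_copy, src_desc, p_src and slice_window_step are assumed names, and the Sequence values are copied from the "1x1 case" branch in that file:

    // iterator hacks: outer tuple = {forward, backward}, inner tuple = one Sequence
    // per dimension being stepped, one entry per transform of src_desc
    constexpr auto src_iterator_hacks =
        make_tuple(make_tuple(Sequence<0, 1, 0>{}, Sequence<0, 0, 1>{}),
                   make_tuple(Sequence<0, 2, 0>{}, Sequence<0, 0, 2>{}));

    constexpr auto src_move_slice_window_iterator_hack = Sequence<0, 1, 2>{};

    blockwise_copy.RunRead(src_desc, p_src, src_iterator_hacks);
    blockwise_copy.MoveSrcSliceWindow(
        src_desc, slice_window_step, src_move_slice_window_iterator_hack);
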
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp

@@ -259,10 +259,47 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
     constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0);
     constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0);

+    // hack to control index calculation when iterating over a_k_m_global tensor
+    constexpr auto a_k_m_global_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
+
+    constexpr auto a_k_m_global_reset_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}),
+                   make_tuple(Sequence<0, 0, 0>{}, Sequence<0, 0, 0>{}));
+
+    // hack to control index calculation when iterating over b_k_n_global tensor
+#if 0
+    // for padded input
+    constexpr auto b_k_n_global_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{},
+                              Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{}));
+
+    constexpr auto b_k_n_global_move_slice_window_iterator_hack =
+        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{};
+#elif 0
+    // for non-padded input
+    constexpr auto b_k_n_global_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 1, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 1>{}),
+                   make_tuple(Sequence<0, 0, 0, 0, 0, 2, 0>{}, Sequence<0, 0, 0, 0, 0, 0, 2>{}));
+
+    constexpr auto b_k_n_global_move_slice_window_iterator_hack = Sequence<0, 0, 0, 0, 0, 1, 2>{};
+#elif 1
+    // for 1x1 case
+    constexpr auto b_k_n_global_iterator_hacks =
+        make_tuple(make_tuple(Sequence<0, 1, 0>{}, Sequence<0, 0, 1>{}),
+                   make_tuple(Sequence<0, 2, 0>{}, Sequence<0, 0, 2>{}));
+
+    constexpr auto b_k_n_global_move_slice_window_iterator_hack = Sequence<0, 1, 2>{};
+#endif
+
     // LDS double buffer: preload data into LDS
     {
-        a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
-        b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);
+        a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global, a_k_m_global_iterator_hacks);
+        b_blockwise_copy.RunRead(b_k_n_global_desc, p_b_global, b_k_n_global_iterator_hacks);

         a_blockwise_copy.RunWrite(a_k_m_block_desc, p_a_block_double);
         b_blockwise_copy.RunWrite(b_k_n_block_desc, p_b_block_double);
...
@@ -284,14 +321,17 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
         {
             // even iteration
             a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
-            b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc, b_block_slice_copy_step);
+            b_blockwise_copy.MoveSrcSliceWindow(b_k_n_global_desc,
+                                                b_block_slice_copy_step,
+                                                b_k_n_global_move_slice_window_iterator_hack);

             __syncthreads();

             // LDS doubel buffer: load next data from device mem
-            a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
-            b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);
+            a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global, a_k_m_global_iterator_hacks);
+            b_blockwise_copy.RunRead(b_k_n_global_desc, p_b_global, b_k_n_global_iterator_hacks);

             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_a_block_even, p_b_block_even, p_c_thread);
...
@@ -302,14 +342,17 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
             // odd iteration
             a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
-            b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc, b_block_slice_copy_step);
+            b_blockwise_copy.MoveSrcSliceWindow(b_k_n_global_desc,
+                                                b_block_slice_copy_step,
+                                                b_k_n_global_move_slice_window_iterator_hack);

             __syncthreads();

             // LDS doubel buffer: load next data from device mem
-            a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
-            b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);
+            a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global, a_k_m_global_iterator_hacks);
+            b_blockwise_copy.RunRead(b_k_n_global_desc, p_b_global, b_k_n_global_iterator_hacks);

             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_a_block_odd, p_b_block_odd, p_c_thread);
...
@@ -326,13 +369,15 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
         if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left
         {
             a_blockwise_copy.MoveSrcSliceWindow(a_k_m_global_desc, a_block_slice_copy_step);
-            b_blockwise_copy.MoveSrcSliceWindow_hack(b_k_n_global_desc, b_block_slice_copy_step);
+            b_blockwise_copy.MoveSrcSliceWindow(b_k_n_global_desc,
+                                                b_block_slice_copy_step,
+                                                b_k_n_global_move_slice_window_iterator_hack);

             __syncthreads();

             // LDS double buffer: load last data from device mem
-            a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global);
-            b_blockwise_copy.RunRead_hack(b_k_n_global_desc, p_b_global);
+            a_blockwise_copy.RunRead(a_k_m_global_desc, p_a_global, a_k_m_global_iterator_hacks);
+            b_blockwise_copy.RunRead(b_k_n_global_desc, p_b_global, b_k_n_global_iterator_hacks);

             // LDS double buffer: GEMM on 2nd-last data
             blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread);
...
@@ -383,6 +428,18 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
         const index_t n_thread_data_on_global =
             n_block_data_on_global + c_thread_mtx_on_block.col;

+        // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
+        // hack for NKHW format
+        constexpr auto c_m0_m1_n0_n1_global_tensor_iterator_hacks =
+            make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 1, 0, 0>{},
+                                  Sequence<0, 0, 1, 0, 0>{}),
+                       make_tuple(Sequence<0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 0, 0, 0>{},
+                                  Sequence<0, 0, 2, 0, 0>{},
+                                  Sequence<0, 0, 2, 0, 0>{}));
+
         ThreadwiseDynamicTensorSliceTransfer_v1r3<AccFloat,
                                                   Float,
...
@@ -402,7 +459,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                             m_thread_data_on_global % M1,
                             n_thread_data_on_global / N1,
                             n_thread_data_on_global % N1))
-            .Run_hack(p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
+            .Run(p_c_thread,
+                 c_m0_m1_n0_n1_global_desc,
+                 p_c_global,
+                 c_m0_m1_n0_n1_global_tensor_iterator_hacks);
     }
 }
...
@@ -435,5 +495,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                       integral_constant<bool, HasDoubleTailKBlockLoop>{});
     }
 };

 } // namespace ck
 #endif
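Note (not part of the commit): the hack tuples added above all follow one layout, which the threadwise transfer (next file) indexes as hacks[I0][i] for forward iterators and hacks[I1][i] for backward iterators: the outer tuple separates the two directions, the inner tuple holds one Sequence per visible dimension being stepped, and each Sequence has one entry per transform of the descriptor (0 means no hack). With that, the epilogue write in the last hunk reduces to a single call; c_thread_copy is a hypothetical name for the ThreadwiseDynamicTensorSliceTransfer_v1r3 instance constructed inline above:

    c_thread_copy.Run(p_c_thread,
                      c_m0_m1_n0_n1_global_desc,
                      p_c_global,
                      c_m0_m1_n0_n1_global_tensor_iterator_hacks);
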
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp

@@ -31,7 +31,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
     using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));

-    using DstCoordStep = decltype(make_dynamic_tensor_coordinate_step(DstDesc{}, Index{}));
+    using DstCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(DstDesc{}, Index{}));

     __device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r3(
         const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
...
@@ -49,8 +49,17 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
         dst_slice_origin_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
     }

-    __device__ void Run_hack(const SrcData* p_src, const DstDesc& dst_desc, DstData* p_dst)
+    template <typename DstIteratorHacks>
+    __device__ void Run(const SrcData* p_src,
+                        const DstDesc& dst_desc,
+                        DstData* p_dst,
+                        const DstIteratorHacks& dst_iterator_hacks)
     {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
+
         // hardcoded for 4D
         // TODO implemente N-D
         static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 4,
...
@@ -60,14 +69,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
             Index dst_scalar_per_access;

             static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(i == DstVectorDim)
-                {
-                    dst_scalar_per_access(i) = DstScalarPerVector;
-                }
-                else
-                {
-                    dst_scalar_per_access(i) = 1;
-                }
+                dst_scalar_per_access(i) = (i == DstVectorDim) ? DstScalarPerVector : 1;
             });

             return dst_scalar_per_access;
...
@@ -76,16 +78,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
         constexpr auto dst_scalar_step_in_vector = [&]() {
             Index dst_scalar_step_in_vector;

-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(i == DstVectorDim)
-                {
-                    dst_scalar_step_in_vector(i) = 1;
-                }
-                else
-                {
-                    dst_scalar_step_in_vector(i) = 0;
-                }
-            });
+            static_for<0, nDim, 1>{}(
+                [&](auto i) { dst_scalar_step_in_vector(i) = (i == DstVectorDim) ? 1 : 0; });

             return dst_scalar_step_in_vector;
         }();
...
@@ -99,63 +93,41 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
             return access_lengths;
         }();

-#if 0
-        const auto dst_forward_steps =
-            make_tuple(make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(1, 0, 0, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, 1, 0, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, 0, 1, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, 0, 0, 1) * dst_scalar_per_access));
-
-        const auto dst_backward_steps =
-            make_tuple(make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(-1, 0, 0, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, -1, 0, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, 0, -1, 0) * dst_scalar_per_access),
-                       make_dynamic_tensor_coordinate_step(
-                           dst_desc, make_multi_index(0, 0, 0, -1) * dst_scalar_per_access));
-#else
-        // hack for NKHW output tensor
-        const auto dst_forward_steps = make_tuple(
-            make_dynamic_tensor_coordinate_step(
-                dst_desc, make_multi_index(1, 0, 0, 0) * dst_scalar_per_access),
-            make_dynamic_tensor_coordinate_step(
-                dst_desc, make_multi_index(0, 1, 0, 0) * dst_scalar_per_access),
-            make_dynamic_tensor_coordinate_step_hack(
-                dst_desc, make_multi_index(0, 0, 1, 0) * dst_scalar_per_access, Sequence<0, 0, 1, 0, 0>{}),
-            make_dynamic_tensor_coordinate_step_hack(
-                dst_desc, make_multi_index(0, 0, 0, 1) * dst_scalar_per_access, Sequence<0, 0, 1, 0, 0>{}));
-
-        const auto dst_backward_steps = make_tuple(
-            make_dynamic_tensor_coordinate_step(
-                dst_desc, make_multi_index(-1, 0, 0, 0) * dst_scalar_per_access),
-            make_dynamic_tensor_coordinate_step(
-                dst_desc, make_multi_index(0, -1, 0, 0) * dst_scalar_per_access),
-            make_dynamic_tensor_coordinate_step_hack(
-                dst_desc, make_multi_index(0, 0, -1, 0) * dst_scalar_per_access, Sequence<0, 0, 2, 0, 0>{}),
-            make_dynamic_tensor_coordinate_step_hack(
-                dst_desc, make_multi_index(0, 0, 0, -1) * dst_scalar_per_access, Sequence<0, 0, 2, 0, 0>{}));
-#endif
-
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-        constexpr auto I2 = Number<2>{};
-        constexpr auto I3 = Number<3>{};
-
-        index_t counter = 0;
+        const auto dst_forward_iterators =
+            make_tuple(make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(1, 0, 0, 0) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I0][I0]),
+                       make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(0, 1, 0, 0) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I0][I1]),
+                       make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(0, 0, 1, 0) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I0][I2]),
+                       make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(0, 0, 0, 1) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I0][I3]));
+
+        const auto dst_backward_iterators =
+            make_tuple(make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(-1, 0, 0, 0) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I1][I0]),
+                       make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(0, -1, 0, 0) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I1][I1]),
+                       make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(0, 0, -1, 0) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I1][I2]),
+                       make_dynamic_tensor_coordinate_iterator(dst_desc,
+                                                               make_multi_index(0, 0, 0, -1) *
+                                                                   dst_scalar_per_access,
+                                                               dst_iterator_hacks[I1][I3]));

         // loop over dim0
         static_for<0,
...
@@ -249,14 +221,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
                             move_dynamic_tensor_coordinate(
                                 dst_desc,
                                 dst_slice_origin_coord_,
-                                dst_forward_steps[DimAccessOrder{}[I3]]);
+                                dst_forward_iterators[DimAccessOrder{}[I3]]);
                         }
                         else
                         {
                             move_dynamic_tensor_coordinate(
                                 dst_desc,
                                 dst_slice_origin_coord_,
-                                dst_backward_steps[DimAccessOrder{}[I3]]);
+                                dst_backward_iterators[DimAccessOrder{}[I3]]);
                         }
                     }
                 });
...
@@ -267,16 +239,17 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
                 {
                     if constexpr(forward_dim2)
                     {
-                        move_dynamic_tensor_coordinate(dst_desc,
-                                                       dst_slice_origin_coord_,
-                                                       dst_forward_steps[DimAccessOrder{}[I2]]);
+                        move_dynamic_tensor_coordinate(
+                            dst_desc,
+                            dst_slice_origin_coord_,
+                            dst_forward_iterators[DimAccessOrder{}[I2]]);
                     }
                     else
                     {
                         move_dynamic_tensor_coordinate(
                             dst_desc,
                             dst_slice_origin_coord_,
-                            dst_backward_steps[DimAccessOrder{}[I2]]);
+                            dst_backward_iterators[DimAccessOrder{}[I2]]);
                     }
                 }
             });
...
@@ -289,13 +262,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
             {
-                move_dynamic_tensor_coordinate(dst_desc,
-                                               dst_slice_origin_coord_,
-                                               dst_forward_steps[DimAccessOrder{}[I1]]);
+                move_dynamic_tensor_coordinate(
+                    dst_desc,
+                    dst_slice_origin_coord_,
+                    dst_forward_iterators[DimAccessOrder{}[I1]]);
             }
             else
             {
-                move_dynamic_tensor_coordinate(dst_desc,
-                                               dst_slice_origin_coord_,
-                                               dst_backward_steps[DimAccessOrder{}[I1]]);
+                move_dynamic_tensor_coordinate(
+                    dst_desc,
+                    dst_slice_origin_coord_,
+                    dst_backward_iterators[DimAccessOrder{}[I1]]);
             }
         });
...
@@ -305,54 +279,47 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
                dst_scalar_per_access[DimAccessOrder{}[I0]])
             {
                 move_dynamic_tensor_coordinate(
-                    dst_desc, dst_slice_origin_coord_, dst_forward_steps[DimAccessOrder{}[I0]]);
+                    dst_desc, dst_slice_origin_coord_, dst_forward_iterators[DimAccessOrder{}[I0]]);
             }
         });

         // move dst coordinate back to slice origin (or not)
         if constexpr(DstResetCoordinateAfterRun)
         {
-            const auto dst_back_step =
-                make_dynamic_tensor_coordinate_step(dst_desc, GetDstCoordinateBackStep());
+            const auto dst_reset_iterator =
+                make_dynamic_tensor_coordinate_iterator(dst_desc, GetDstCoordinateResetStep());

-            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_coord_, dst_back_step);
+            move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_coord_, dst_reset_iterator);
         }
     }

-    __device__ static constexpr auto GetDstCoordinateBackStep()
+    __device__ static constexpr auto GetDstCoordinateResetStep()
     {
         constexpr auto dst_scalar_per_access = [&]() {
             Index dst_scalar_per_access;

             static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(i == DstVectorDim)
-                {
-                    dst_scalar_per_access(i) = DstScalarPerVector;
-                }
-                else
-                {
-                    dst_scalar_per_access(i) = 1;
-                }
+                dst_scalar_per_access(i) = (i == DstVectorDim) ? DstScalarPerVector : 1;
             });

             return dst_scalar_per_access;
         }();

-        MultiIndex<nDim> dst_back_step;
+        MultiIndex<nDim> dst_reset_iterator;

         // TODO: this is wrong, need to consider DimAccessOrder
-        dst_back_step(Number<0>{}) = dst_scalar_per_access[Number<0>{}] - SliceLengths{}[0];
+        dst_reset_iterator(Number<0>{}) = dst_scalar_per_access[Number<0>{}] - SliceLengths{}[0];

         static_for<1, nDim, 1>{}([&](auto i) {
             constexpr auto i_m1 = i - Number<1>{};

             // TODO: this is wrong
-            dst_back_step(i) = (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
-                                   ? 0
-                                   : (dst_scalar_per_access[i] - SliceLengths{}[i]);
+            dst_reset_iterator(i) = (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
+                                        ? 0
+                                        : (dst_scalar_per_access[i] - SliceLengths{}[i]);
         });

-        return dst_back_step;
+        return dst_reset_iterator;
     }

     // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
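Note (not part of the commit): a sketch of what a DstIteratorHacks argument to the 4D Run() above looks like on the caller side — forward iterators are read from hacks[I0][0..3], backward iterators from hacks[I1][0..3], matching how dst_iterator_hacks is indexed in the hunk. The all-zero entries below mean "no hack"; real callers such as gridwise_dynamic_gemm.hpp pass non-zero entries for the transforms that need the DynamicMerge hack. transfer and p_thread_buf are assumed names:

    constexpr auto dst_iterator_hacks =
        make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
                   make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
                              Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}));

    transfer.Run(p_thread_buf, dst_desc, p_dst, dst_iterator_hacks);
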
...
@@ -360,12 +327,13 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
...
@@ -360,12 +327,13 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
const
Index
&
dst_slice_origin_step_idx
)
const
Index
&
dst_slice_origin_step_idx
)
{
{
// if dst coord was not reset by RunWrite(), then need to adjust the step here
// if dst coord was not reset by RunWrite(), then need to adjust the step here
const
auto
adjusted_step_idx
=
DstResetCoordinateAfterRun
const
auto
adjusted_step_idx
=
?
dst_slice_origin_step_idx
DstResetCoordinateAfterRun
?
dst_slice_origin_step_idx
:
dst_slice_origin_step_idx
+
GetDstCoordinate
Back
Step
();
:
dst_slice_origin_step_idx
+
GetDstCoordinate
Reset
Step
();
// is it OK to construct a new step every time?
// is it OK to construct a new step every time?
const
auto
adjusted_step
=
make_dynamic_tensor_coordinate_step
(
dst_desc
,
adjusted_step_idx
);
const
auto
adjusted_step
=
make_dynamic_tensor_coordinate_iterator
(
dst_desc
,
adjusted_step_idx
);
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_coord_
,
adjusted_step
);
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_coord_
,
adjusted_step
);
}
}
...
@@ -410,8 +378,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
...
@@ -410,8 +378,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
using
SrcCoord
=
decltype
(
make_dynamic_tensor_coordinate
(
SrcDesc
{},
Index
{}));
using
SrcCoord
=
decltype
(
make_dynamic_tensor_coordinate
(
SrcDesc
{},
Index
{}));
using
DstCoord
=
decltype
(
make_dynamic_tensor_coordinate
(
DstDesc
{},
Index
{}));
using
DstCoord
=
decltype
(
make_dynamic_tensor_coordinate
(
DstDesc
{},
Index
{}));
using
SrcCoord
S
te
p
=
decltype
(
make_dynamic_tensor_coordinate_
s
te
p
(
SrcDesc
{},
Index
{}));
using
SrcCoord
I
te
rator
=
decltype
(
make_dynamic_tensor_coordinate_
i
te
rator
(
SrcDesc
{},
Index
{}));
using
DstCoord
S
te
p
=
decltype
(
make_dynamic_tensor_coordinate_
s
te
p
(
DstDesc
{},
Index
{}));
using
DstCoord
I
te
rator
=
decltype
(
make_dynamic_tensor_coordinate_
i
te
rator
(
DstDesc
{},
Index
{}));
__device__
constexpr
ThreadwiseDynamicTensorSliceTransfer_v3
(
const
SrcDesc
&
src_desc
,
__device__
constexpr
ThreadwiseDynamicTensorSliceTransfer_v3
(
const
SrcDesc
&
src_desc
,
const
Index
&
src_slice_origin
,
const
Index
&
src_slice_origin
,
...
@@ -444,8 +412,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
...
@@ -444,8 +412,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
dst_slice_origin_
=
make_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_idx
);
dst_slice_origin_
=
make_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_idx
);
}
}
__device__
void
RunRead
(
const
SrcDesc
&
src_desc
,
const
SrcData
*
p_src
)
template
<
typename
SrcIteratorHacks
>
__device__
void
RunRead
(
const
SrcDesc
&
src_desc
,
const
SrcData
*
p_src
,
const
SrcIteratorHacks
&
src_iterator_hacks
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
// hardcoded for 2D
// hardcoded for 2D
// TODO implemente N-D
// TODO implemente N-D
static_assert
(
remove_reference_t
<
SrcDesc
>::
GetNumOfDimension
()
==
2
,
static_assert
(
remove_reference_t
<
SrcDesc
>::
GetNumOfDimension
()
==
2
,
...
@@ -455,14 +429,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
...
@@ -455,14 +429,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
Index
src_scalar_per_access
;
Index
src_scalar_per_access
;
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
if
constexpr
(
i
==
SrcVectorDim
)
src_scalar_per_access
(
i
)
=
(
i
==
SrcVectorDim
)
?
SrcScalarPerVector
:
1
;
{
src_scalar_per_access
(
i
)
=
SrcScalarPerVector
;
}
else
{
src_scalar_per_access
(
i
)
=
1
;
}
});
});
return
src_scalar_per_access
;
return
src_scalar_per_access
;
...
@@ -471,35 +438,36 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
...
@@ -471,35 +438,36 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
constexpr
auto
src_scalar_step_in_vector
=
[
&
]()
{
constexpr
auto
src_scalar_step_in_vector
=
[
&
]()
{
Index
src_scalar_step_in_vector
;
Index
src_scalar_step_in_vector
;
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
static_for
<
0
,
nDim
,
1
>
{}(
if
constexpr
(
i
==
SrcVectorDim
)
[
&
](
auto
i
)
{
src_scalar_step_in_vector
(
i
)
=
(
i
==
SrcVectorDim
)
?
1
:
0
;
});
{
src_scalar_step_in_vector
(
i
)
=
1
;
}
else
{
src_scalar_step_in_vector
(
i
)
=
0
;
}
});
return
src_scalar_step_in_vector
;
return
src_scalar_step_in_vector
;
}();
}();
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
constexpr
auto
access_lengths
=
[
&
]()
{
const
auto
src_step_0_p
=
make_dynamic_tensor_coordinate_step
(
Index
access_lengths
;
src_desc
,
make_multi_index
(
0
,
1
)
*
src_scalar_per_access
);
const
auto
src_step_0_m
=
make_dynamic_tensor_coordinate_step
(
src_desc
,
make_multi_index
(
0
,
-
1
)
*
src_scalar_per_access
);
const
auto
src_step_p_0
=
make_dynamic_tensor_coordinate_step
(
static_for
<
0
,
nDim
,
1
>
{}
(
src_desc
,
make_multi_index
(
1
,
0
)
*
src_scalar_per_access
);
[
&
](
auto
i
)
{
access_lengths
(
i
)
=
SliceLengths
{}[
i
]
/
src_scalar_per_access
[
i
];
}
);
const
auto
src_step_m_0
=
make_dynamic_tensor_coordinate_step
(
return
access_lengths
;
src_desc
,
make_multi_index
(
-
1
,
0
)
*
src_scalar_per_access
);
}(
);
constexpr
auto
I0
=
Number
<
0
>
{};
const
auto
src_forward_iterators
=
make_tuple
(
constexpr
auto
I1
=
Number
<
1
>
{};
make_dynamic_tensor_coordinate_iterator
(
src_desc
,
make_multi_index
(
1
,
0
)
*
src_scalar_per_access
,
src_iterator_hacks
[
I0
][
I0
]),
make_dynamic_tensor_coordinate_iterator
(
src_desc
,
make_multi_index
(
0
,
1
)
*
src_scalar_per_access
,
src_iterator_hacks
[
I0
][
I1
]));
const
auto
src_backward_iterators
=
make_tuple
(
make_dynamic_tensor_coordinate_iterator
(
src_desc
,
make_multi_index
(
-
1
,
0
)
*
src_scalar_per_access
,
src_iterator_hacks
[
I1
][
I0
]),
make_dynamic_tensor_coordinate_iterator
(
src_desc
,
make_multi_index
(
0
,
-
1
)
*
src_scalar_per_access
,
src_iterator_hacks
[
I1
][
I1
]));
static_for
<
0
,
SliceLengths
{}[
I0
],
src_scalar_per_access
[
I0
]
>
{}([
&
](
auto
iter0
)
{
static_for
<
0
,
SliceLengths
{}[
I0
],
src_scalar_per_access
[
I0
]
>
{}([
&
](
auto
iter0
)
{
static_for
<
0
,
SliceLengths
{}[
I1
],
src_scalar_per_access
[
I1
]
>
{}([
&
](
auto
iter1
)
{
static_for
<
0
,
SliceLengths
{}[
I1
],
src_scalar_per_access
[
I1
]
>
{}([
&
](
auto
iter1
)
{
...
@@ -517,10 +485,11 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
...
@@ -517,10 +485,11 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
static_assert
(
SrcAddressSpace
==
AddressSpace
::
Global
,
static_assert
(
SrcAddressSpace
==
AddressSpace
::
Global
,
"wrong! hardcoded to use buffer_load, src must be global mem"
);
"wrong! hardcoded to use buffer_load, src must be global mem"
);
vector_type
<
SrcData
,
SrcScalarPerVector
>
src_vector
;
using
SrcVectorType
=
typename
vector_type
<
SrcData
,
SrcScalarPerVector
>::
MemoryType
;
using
SrcVectorType
=
typename
vector_type
<
SrcData
,
SrcScalarPerVector
>::
MemoryType
;
vector_type
<
SrcData
,
SrcScalarPerVector
>
src_vector
;
#if 1
src_vector
.
Vector
()
=
amd_buffer_load
<
SrcData
,
SrcScalarPerVector
>
(
src_vector
.
Vector
()
=
amd_buffer_load
<
SrcData
,
SrcScalarPerVector
>
(
p_src
,
src_slice_origin_
.
GetOffset
(),
true
,
src_desc
.
GetElementSpaceSize
());
p_src
,
src_slice_origin_
.
GetOffset
(),
true
,
src_desc
.
GetElementSpaceSize
());
...
@@ -535,38 +504,54 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
...
@@ -535,38 +504,54 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
buffer_
(
Number
<
buffer_offset
>
{})
=
src_vector
[
i
];
buffer_
(
Number
<
buffer_offset
>
{})
=
src_vector
[
i
];
});
});
#else
const
bool
is_valid
=
coordinate_has_valid_offset_assuming_visible_index_is_valid
(
src_desc
,
src_slice_origin_
);
src_vector
.
Vector
()
=
amd_buffer_load
<
SrcData
,
SrcScalarPerVector
>
(
p_src
,
src_slice_origin_
.
GetOffset
(),
is_valid
,
src_desc
.
GetElementSpaceSize
());
static_for
<
0
,
SrcScalarPerVector
,
1
>
{}([
&
](
auto
i
)
{
constexpr
index_t
buffer_offset
=
buffer_desc_
.
CalculateOffset
(
make_multi_index
(
i0
,
i1
)
+
i
*
src_scalar_step_in_vector
);
buffer_
(
Number
<
buffer_offset
>
{})
=
src_vector
[
i
];
});
#endif
// move dim1 iterator
// move dim1 iterator
if
constexpr
(
iter1
.
value
<
SliceL
engths
{}
[
I1
]
-
src_scalar_per_access
[
I1
]
)
if
constexpr
(
iter1
.
value
<
access_l
engths
[
I1
]
-
1
)
{
{
if
constexpr
(
forward_dim1
)
if
constexpr
(
forward_dim1
)
{
{
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_step_0_p
);
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_forward_iterators
[
I1
]);
}
}
else
else
{
{
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_step_0_m
);
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_backward_iterators
[
I1
]);
}
}
}
}
});
});
// move dim0 iterator
// move dim0 iterator
if
constexpr
(
iter0
.
value
<
SliceL
engths
{}
[
I0
]
-
src_scalar_per_access
[
I0
]
)
if
constexpr
(
iter0
.
value
<
access_l
engths
[
I0
]
-
1
)
{
{
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_step_p_0
);
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_forward_iterators
[
I0
]);
}
}
});
});
// move src coordinate back to its slice origin
// move src coordinate back to its slice origin
if
constexpr
(
SrcResetCoordinateAfterRun
)
if
constexpr
(
SrcResetCoordinateAfterRun
)
{
{
const
auto
src_
back_step
=
const
auto
src_
reset_iterator
=
make_dynamic_tensor_coordinate_
s
te
p
(
src_desc
,
GetSrcCoordinate
Back
Step
());
make_dynamic_tensor_coordinate_
i
te
rator
(
src_desc
,
GetSrcCoordinate
Reset
Step
());
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_
back_step
);
move_dynamic_tensor_coordinate
(
src_desc
,
src_slice_origin_
,
src_
reset_iterator
);
}
}
}
}
__device__
void
RunWrite
(
const
DstDesc
&
dst_desc
,
DstData
*
p_dst
)
__device__
void
RunWrite
(
const
DstDesc
&
dst_desc
,
DstData
*
p_dst
)
{
{
static_assert
(
remove_reference_t
<
DstDesc
>::
GetNumOfDimension
()
==
2
,
static_assert
(
remove_reference_t
<
DstDesc
>::
GetNumOfDimension
()
==
2
,
...
@@ -578,14 +563,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
...
@@ -578,14 +563,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
{
{
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
// TODO use constexpr for coordinate-step to make sure compiler behave correctly
const
auto
dst_step_0_p
=
const
auto
dst_step_0_p
=
make_dynamic_tensor_coordinate_
s
te
p
(
dst_desc
,
make_multi_index
(
0
,
1
));
make_dynamic_tensor_coordinate_
i
te
rator
(
dst_desc
,
make_multi_index
(
0
,
1
));
const
auto
dst_step_0_m
=
const
auto
dst_step_0_m
=
make_dynamic_tensor_coordinate_
s
te
p
(
dst_desc
,
make_multi_index
(
0
,
-
1
));
make_dynamic_tensor_coordinate_
i
te
rator
(
dst_desc
,
make_multi_index
(
0
,
-
1
));
const
auto
dst_step_p_0
=
const
auto
dst_step_p_0
=
make_dynamic_tensor_coordinate_
s
te
p
(
dst_desc
,
make_multi_index
(
1
,
0
));
make_dynamic_tensor_coordinate_
i
te
rator
(
dst_desc
,
make_multi_index
(
1
,
0
));
const
auto
dst_step_m_0
=
const
auto
dst_step_m_0
=
make_dynamic_tensor_coordinate_
s
te
p
(
dst_desc
,
make_multi_index
(
-
1
,
0
));
make_dynamic_tensor_coordinate_
i
te
rator
(
dst_desc
,
make_multi_index
(
-
1
,
0
));
constexpr
index_t
Len0
=
SliceLengths
{}[
0
];
constexpr
index_t
Len0
=
SliceLengths
{}[
0
];
constexpr
index_t
Len1
=
SliceLengths
{}[
1
];
constexpr
index_t
Len1
=
SliceLengths
{}[
1
];
...
@@ -637,20 +622,15 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
...
@@ -637,20 +622,15 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
// move dst coordinate back to its slice origin
// move dst coordinate back to its slice origin
if
constexpr
(
DstResetCoordinateAfterRun
)
if
constexpr
(
DstResetCoordinateAfterRun
)
{
{
const
auto
dst_
back_step
=
const
auto
dst_
reset_iterator
=
make_dynamic_tensor_coordinate_
s
te
p
(
dst_desc
,
GetDstCoordinate
Back
Step
());
make_dynamic_tensor_coordinate_
i
te
rator
(
dst_desc
,
GetDstCoordinate
Reset
Step
());
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_
,
dst_
back_step
);
move_dynamic_tensor_coordinate
(
dst_desc
,
dst_slice_origin_
,
dst_
reset_iterator
);
}
}
}
}
__device__
void
RunRead_hack
(
const
SrcDesc
&
src_desc
,
const
SrcData
*
p_src
)
__device__
static
constexpr
auto
GetSrcCoordinateResetStep
(
)
{
{
// hardcoded for 2D
// TODO implemente N-D
static_assert
(
remove_reference_t
<
SrcDesc
>::
GetNumOfDimension
()
==
2
,
"wrong! hardcoded for 2D tensor"
);
constexpr
auto
src_scalar_per_access
=
[
&
]()
{
constexpr
auto
src_scalar_per_access
=
[
&
]()
{
Index
src_scalar_per_access
;
Index
src_scalar_per_access
;
...
@@ -668,235 +648,22 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
...
@@ -668,235 +648,22 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
return
src_scalar_per_access
;
return
src_scalar_per_access
;
}();
}();
constexpr
auto
src_scalar_step_in_vector
=
[
&
]()
{
MultiIndex
<
nDim
>
src_reset_iterator
;
Index
src_scalar_step_in_vector
;
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
i
)
{
if
constexpr
(
i
==
SrcVectorDim
)
{
src_scalar_step_in_vector
(
i
)
=
1
;
}
else
{
src_scalar_step_in_vector
(
i
)
=
0
;
}
});
return
src_scalar_step_in_vector
;
}();
constexpr
auto
access_lengths
=
[
&
]()
{
Index
access_lengths
;
static_for
<
0
,
nDim
,
1
>
{}(
src_reset_iterator
(
Number
<
0
>
{})
=
src_scalar_per_access
[
Number
<
0
>
{}]
-
SliceLengths
{}[
0
];
[
&
](
auto
i
)
{
access_lengths
(
i
)
=
SliceLengths
{}[
i
]
/
src_scalar_per_access
[
i
];
});
return
access_lengths
;
}();
-#if 0 // hack
-        // TODO use constexpr for coordinate-step to make sure compiler behave correctly
-        const auto src_step_0_p = make_dynamic_tensor_coordinate_step(
-            src_desc, make_multi_index(0, 1) * src_scalar_per_access);
-
-        const auto src_step_0_m = make_dynamic_tensor_coordinate_step(
-            src_desc, make_multi_index(0, -1) * src_scalar_per_access);
-
-        const auto src_step_p_0 = make_dynamic_tensor_coordinate_step(
-            src_desc, make_multi_index(1, 0) * src_scalar_per_access);
-
-        const auto src_step_m_0 = make_dynamic_tensor_coordinate_step(
-            src_desc, make_multi_index(-1, 0) * src_scalar_per_access);
-#elif 1
-        // for padded input tensor
-        const auto src_step_0_p = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, 1) * src_scalar_per_access,
-            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
-
-        const auto src_step_0_m = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, -1) * src_scalar_per_access,
-            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});
-
-        const auto src_step_p_0 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(1, 0) * src_scalar_per_access,
-            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
-
-        const auto src_step_m_0 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(-1, 0) * src_scalar_per_access,
-            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
-#elif 0
-        // for non-padded input tensor
-        const auto src_step_0_p = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, 1) * src_scalar_per_access,
-            Sequence<0, 0, 0, 0, 0, 0, 1>{});
-
-        const auto src_step_0_m = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, -1) * src_scalar_per_access,
-            Sequence<0, 0, 0, 0, 0, 0, 2>{});
-
-        const auto src_step_p_0 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(1, 0) * src_scalar_per_access,
-            Sequence<0, 0, 0, 0, 0, 1, 0>{});
-
-        const auto src_step_m_0 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(-1, 0) * src_scalar_per_access,
-            Sequence<0, 0, 0, 0, 0, 2, 0>{});
-#elif 1
-        // for 1x1, input tensor
-        const auto src_step_0_p = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, 1) * src_scalar_per_access, Sequence<0, 0, 1>{});
-
-        const auto src_step_0_m = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, -1) * src_scalar_per_access, Sequence<0, 0, 2>{});
-
-        const auto src_step_p_0 = make_dynamic_tensor_coordinate_step(
-            src_desc, make_multi_index(1, 0) * src_scalar_per_access);
-
-        const auto src_step_m_0 = make_dynamic_tensor_coordinate_step(
-            src_desc, make_multi_index(-1, 0) * src_scalar_per_access);
-#endif
-        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
-
-        static_for<0, SliceLengths{}[I0], src_scalar_per_access[I0]>{}([&](auto iter0) {
-            static_for<0, SliceLengths{}[I1], src_scalar_per_access[I1]>{}([&](auto iter1) {
-                // step direction
-                constexpr bool forward_dim1 =
-                    (iter0.value % (2 * src_scalar_per_access[I0]) == 0);
-
-                constexpr index_t i0 = iter0.value;
-                constexpr index_t i1 =
-                    forward_dim1 ? iter1.value
-                                 : SliceLengths{}[I1] - src_scalar_per_access[I1] - iter1.value;
-
-                // do work
-                // hardcoding for buffer_load
-                // TODO refactor transfer_data() to encapsulate this
-                static_assert(SrcAddressSpace == AddressSpace::Global,
-                              "wrong! hardcoded to use buffer_load, src must be global mem");
-
-                using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
-
-                vector_type<SrcData, SrcScalarPerVector> src_vector;
-#if 1
-                src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
-                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
-
-                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                    src_desc, src_slice_origin_);
-
-                src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};
-
-                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
-                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
-                        make_multi_index(i0, i1) + i * src_scalar_step_in_vector);
-
-                    buffer_(Number<buffer_offset>{}) = src_vector[i];
-                });
-#elif 1
-                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                    src_desc, src_slice_origin_);
-
-                src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
-                    p_src, src_slice_origin_.GetOffset(), is_valid, src_desc.GetElementSpaceSize());
-
-                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
-                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
-                        make_multi_index(i0, i1) + i * src_scalar_step_in_vector);
-
-                    buffer_(Number<buffer_offset>{}) = src_vector[i];
-                });
-#elif 0
-                static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
-                    constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
-                        make_multi_index(i0, i1) + i * src_scalar_step_in_vector);
-
-                    int32x2_t is_valid_i32 = is_valid;
-
-                    asm volatile("\n \
-                        v_cmp_gt_u32_e64 is_valid_flag, is_valid_i32, 0 \n \
-                        v_cndmask_b32_e64 src_data, 0, src_data, is_valid_flag \n \
-                        "
-                                 : "=s"(is_valid_flag), "=v"(src_data),
-                                 : "v"(is_valid_i32), "2"(is_valid_flag), "3"(src_data));
-
-                    buffer_(Number<buffer_offset>{}) = src_data;
-                });
-#endif
-                // move dim1 iterator
-                if constexpr(iter1.value < access_lengths[I1] - 1)
-                {
-                    if constexpr(forward_dim1)
-                    {
-                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_p);
-                    }
-                    else
-                    {
-                        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_0_m);
-                    }
-                }
-            });
-
-            // move dim0 iterator
-            if constexpr(iter0.value < access_lengths[I0] - 1)
-            {
-                move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_step_p_0);
-            }
-        });
-
-        // move src coordinate back to its slice origin
-        if constexpr(SrcResetCoordinateAfterRun)
-        {
-            const auto src_back_step =
-                make_dynamic_tensor_coordinate_step(src_desc, GetSrcCoordinateBackStep());
-
-            move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
-        }
-    }
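The deleted RunRead_hack above loads a small vector per access and zeroes it when the source coordinate points into a padded or otherwise invalid region. A standalone host-side sketch of that masking idea follows; it is illustrative only and stands in for amd_buffer_load plus the descriptor's validity check, it is not the commit's code.

// illustrative only; stands in for amd_buffer_load + the validity check
#include <array>
#include <cstdio>
#include <vector>

template <int N>
std::array<float, N> masked_vector_load(const std::vector<float>& src, int offset, bool is_valid)
{
    std::array<float, N> v{}; // value-initialized to 0

    if(is_valid)
    {
        for(int i = 0; i < N; ++i)
            v[i] = src[offset + i];
    }

    // when !is_valid the zeros remain, mirroring
    // "src_vector = is_valid ? src_vector : SrcVectorType{0}"
    return v;
}

int main()
{
    const std::vector<float> src(16, 2.0f);

    const auto a = masked_vector_load<4>(src, 4, true);  // in-bounds access
    const auto b = masked_vector_load<4>(src, 4, false); // padded region -> zeros

    std::printf("a[0] = %.1f, b[0] = %.1f\n", a[0], b[0]);
    return 0;
}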
-    __device__ static constexpr auto GetSrcCoordinateBackStep()
-    {
-        constexpr auto src_scalar_per_access = [&]() {
-            Index src_scalar_per_access;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(i == SrcVectorDim)
-                {
-                    src_scalar_per_access(i) = SrcScalarPerVector;
-                }
-                else
-                {
-                    src_scalar_per_access(i) = 1;
-                }
-            });
-
-            return src_scalar_per_access;
-        }();
-
-        MultiIndex<nDim> src_back_step;
-
-        src_back_step(Number<0>{}) = src_scalar_per_access[Number<0>{}] - SliceLengths{}[0];
         static_for<1, nDim, 1>{}([&](auto i) {
             constexpr auto i_m1 = i - Number<1>{};

-            src_back_step(i) = (SliceLengths{}[i_m1] % (2 * src_scalar_per_access[i_m1]) == 0)
-                                   ? 0
-                                   : (src_scalar_per_access[i] - SliceLengths{}[i]);
+            src_reset_iterator(i) = (SliceLengths{}[i_m1] % (2 * src_scalar_per_access[i_m1]) == 0)
+                                        ? 0
+                                        : (src_scalar_per_access[i] - SliceLengths{}[i]);
         });

-        return src_back_step;
+        return src_reset_iterator;
     }
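GetSrcCoordinateResetStep computes, at compile time, the index delta that returns the coordinate from wherever the zig-zag traversal ends back to the slice origin: dimension 0 always ends at its last access position, and an inner dimension already sits at the origin when the next-outer dimension makes an even number of passes. A small host-side restatement of that arithmetic, with made-up example numbers and not part of this commit:

// host-side restatement of the reset-step formula above (2D case, example numbers)
#include <array>
#include <cstdio>

std::array<int, 2> reset_step(std::array<int, 2> slice_lengths, std::array<int, 2> scalar_per_access)
{
    std::array<int, 2> step{};

    // dim 0 always ends at its last access position, so step back by the whole extent
    step[0] = scalar_per_access[0] - slice_lengths[0];

    // dim 1 already sits at the origin when dim 0 makes an even number of passes
    const bool even_passes = (slice_lengths[0] % (2 * scalar_per_access[0]) == 0);
    step[1] = even_passes ? 0 : (scalar_per_access[1] - slice_lengths[1]);

    return step;
}

int main()
{
    // example: 8x16 slice read with a 4-wide vector along dim 1
    const auto s = reset_step({8, 16}, {1, 4});
    std::printf("reset step = (%d, %d)\n", s[0], s[1]);
    return 0;
}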
-    __device__ static constexpr auto GetDstCoordinateBackStep()
+    __device__ static constexpr auto GetDstCoordinateResetStep()
     {
         constexpr auto dst_scalar_per_access = [&]() {
             Index dst_scalar_per_access;
...
@@ -915,19 +682,19 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
             return dst_scalar_per_access;
         }();

-        MultiIndex<nDim> dst_back_step;
+        MultiIndex<nDim> dst_reset_iterator;

-        dst_back_step(Number<0>{}) = dst_scalar_per_access[Number<0>{}] - SliceLengths{}[0];
+        dst_reset_iterator(Number<0>{}) = dst_scalar_per_access[Number<0>{}] - SliceLengths{}[0];

         static_for<1, nDim, 1>{}([&](auto i) {
             constexpr auto i_m1 = i - Number<1>{};

-            dst_back_step(i) = (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
-                                   ? 0
-                                   : (dst_scalar_per_access[i] - SliceLengths{}[i]);
+            dst_reset_iterator(i) = (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
+                                        ? 0
+                                        : (dst_scalar_per_access[i] - SliceLengths{}[i]);
         });

-        return dst_back_step;
+        return dst_reset_iterator;
     }
     // src_slice_origin_step_idx need to be known at compile-time, for performance reason
...
@@ -935,59 +702,50 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                                        const Index& src_slice_origin_step_idx)
     {
         // if src coord was not reset by RunRead(), then need to adjust the step here
-        const auto adjusted_step_idx = SrcResetCoordinateAfterRun
-                                           ? src_slice_origin_step_idx
-                                           : src_slice_origin_step_idx + GetSrcCoordinateBackStep();
+        const auto adjusted_step_idx =
+            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
+                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

         // is it OK to construct a new step every time?
-        const auto adjusted_step = make_dynamic_tensor_coordinate_step(src_desc, adjusted_step_idx);
+        const auto adjusted_step = make_dynamic_tensor_coordinate_iterator(src_desc, adjusted_step_idx);

         move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
     }
-    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
-    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
-                                       const Index& dst_slice_origin_step_idx)
+    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
+    template <typename SrcMoveSliceWindowIteratorHack>
+    __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
+                                       const Index& src_slice_origin_step_idx,
+                                       const SrcMoveSliceWindowIteratorHack& src_move_slice_window_iterator_hack)
     {
-        // if dst coord was not reset by RunWrite(), then need to adjust the step here
-        const auto adjusted_step_idx = DstResetCoordinateAfterRun
-                                           ? dst_slice_origin_step_idx
-                                           : dst_slice_origin_step_idx + GetDstCoordinateBackStep();
+        // if src coord was not reset by RunRead(), then need to adjust the step here
+        const auto adjusted_step_idx =
+            SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
+                                       : src_slice_origin_step_idx + GetSrcCoordinateResetStep();

         // is it OK to construct a new step every time?
-        const auto adjusted_step = make_dynamic_tensor_coordinate_step(dst_desc, adjusted_step_idx);
+        const auto adjusted_step = make_dynamic_tensor_coordinate_iterator(
+            src_desc, adjusted_step_idx, src_move_slice_window_iterator_hack);

-        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, adjusted_step);
+        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
     }
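MoveSrcSliceWindow above folds the pending reset displacement into the requested window step whenever RunRead() leaves the coordinate at the end of the slice instead of resetting it, so only one coordinate move is issued. A trivial scalar sketch of that adjustment (illustrative values only, not part of this commit):

// trivial scalar stand-in for the multi-index arithmetic above
#include <cstdio>

int main()
{
    const bool reset_after_run = false; // assume RunRead() left the coordinate in place
    const int  window_step     = 16;    // requested slice-window move
    const int  pending_reset   = -7;    // what GetSrcCoordinateResetStep() would contribute

    // fold the pending reset into the requested step so only one move is issued
    const int adjusted_step = reset_after_run ? window_step : window_step + pending_reset;

    std::printf("move coordinate by %d\n", adjusted_step);
    return 0;
}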
-    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
-    __device__ void MoveSrcSliceWindow_hack(const SrcDesc& src_desc,
-                                            const Index& src_slice_origin_step_idx)
+    // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
+    __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
+                                       const Index& dst_slice_origin_step_idx)
     {
-        // if src coord was not reset by RunRead(), then need to adjust the step here
-        const auto adjusted_step_idx = SrcResetCoordinateAfterRun
-                                           ? src_slice_origin_step_idx
-                                           : src_slice_origin_step_idx + GetSrcCoordinateBackStep();
+        // if dst coord was not reset by RunWrite(), then need to adjust the step here
+        const auto adjusted_step_idx =
+            DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
+                                       : dst_slice_origin_step_idx + GetDstCoordinateResetStep();

         // is it OK to construct a new step every time?
-#if 0 // hack
-        const auto adjusted_step = make_dynamic_tensor_coordinate_step(
-            src_desc, adjusted_step_idx);
-#elif 1
-        // for padded input tensor
-        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2>{});
-#elif 0
-        // for non-padded input tensor
-        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, adjusted_step_idx, Sequence<0, 0, 0, 0, 0, 1, 2>{});
-#elif 1
-        // for 1x1, input tensor
-        const auto adjusted_step = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, adjusted_step_idx, Sequence<0, 1, 2>{});
-#endif
+        const auto adjusted_step =
+            make_dynamic_tensor_coordinate_iterator(dst_desc, adjusted_step_idx);

-        move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, adjusted_step);
+        move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, adjusted_step);
     }
     private:
...
driver/include/device_dummy_dynamic_transform.hpp
View file @
39821a90
...
@@ -55,11 +55,12 @@ void device_dummy_dynamic_transform(InDesc,
         auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate(
             in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));

-        const auto in_gemmk_gemmn_gemmkpack_coord_step_0_0_1 = make_dynamic_tensor_coordinate_step(
-            in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 1));
+        const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1 =
+            make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
+                                                    make_multi_index(0, 0, 1));

         print_array_v2("do_tansforms 0 0 1: ",
-                       in_gemmk_gemmn_gemmkpack_coord_step_0_0_1.do_transforms_);
+                       in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1.do_transforms_);

         for(index_t iter = 0; iter < 10; ++iter)
         {
...
@@ -71,7 +72,7 @@ void device_dummy_dynamic_transform(InDesc,
             move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
                                            in_gemmk_gemmn_gemmkpack_coord,
-                                           in_gemmk_gemmn_gemmkpack_coord_step_0_0_1);
+                                           in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1);
         }
     }
...
@@ -79,11 +80,12 @@ void device_dummy_dynamic_transform(InDesc,
         auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate(
             in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));

-        const auto in_gemmk_gemmn_gemmkpack_coord_step_0_1_0 = make_dynamic_tensor_coordinate_step(
-            in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 1, 0));
+        const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0 =
+            make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
+                                                    make_multi_index(0, 1, 0));

         print_array_v2("do_tansforms 0 1 0: ",
-                       in_gemmk_gemmn_gemmkpack_coord_step_0_1_0.do_transforms_);
+                       in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0.do_transforms_);

         for(index_t iter = 0; iter < 10; ++iter)
         {
...
@@ -95,7 +97,7 @@ void device_dummy_dynamic_transform(InDesc,
             move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
                                            in_gemmk_gemmn_gemmkpack_coord,
-                                           in_gemmk_gemmn_gemmkpack_coord_step_0_1_0);
+                                           in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0);
         }
     }
...
@@ -103,11 +105,12 @@ void device_dummy_dynamic_transform(InDesc,
         auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate(
             in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));

-        const auto in_gemmk_gemmn_gemmkpack_coord_step_1_0_0 = make_dynamic_tensor_coordinate_step(
-            in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(1, 0, 0));
+        const auto in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0 =
+            make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
+                                                    make_multi_index(1, 0, 0));

         print_array_v2("do_tansforms 1 0 0: ",
-                       in_gemmk_gemmn_gemmkpack_coord_step_1_0_0.do_transforms_);
+                       in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0.do_transforms_);

         for(index_t iter = 0; iter < 10; ++iter)
         {
...
@@ -119,7 +122,7 @@ void device_dummy_dynamic_transform(InDesc,
             move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
                                            in_gemmk_gemmn_gemmkpack_coord,
-                                           in_gemmk_gemmn_gemmkpack_coord_step_1_0_0);
+                                           in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0);
         }
     }
...
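The three blocks above build one coordinate iterator per move direction ((0, 0, 1), (0, 1, 0), (1, 0, 0)) because each direction generally activates a different subset of the descriptor's transforms (the printed do_transforms_). Below is a simplified host-side analogue in which a plain strided 3D layout stands in for the transformed descriptor; it is not part of this commit and the strides are made-up example values.

// a plain strided 3D layout stands in for the transformed descriptor
#include <array>
#include <cstdio>

int main()
{
    const std::array<int, 3> strides = {8 * 16, 16, 1}; // made-up packed strides

    const std::array<std::array<int, 3>, 3> directions = {{{0, 0, 1}, {0, 1, 0}, {1, 0, 0}}};

    for(const auto& d : directions)
    {
        // precompute the per-move offset delta once per direction (the "iterator")
        const int offset_delta = d[0] * strides[0] + d[1] * strides[1] + d[2] * strides[2];

        int offset = 0;
        for(int iter = 0; iter < 10; ++iter)
            offset += offset_delta;

        std::printf("direction (%d,%d,%d): offset after 10 moves = %d\n", d[0], d[1], d[2], offset);
    }
    return 0;
}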
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @
39821a90
...
@@ -233,7 +233,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    constexpr auto conv_driver =
-#if 1
+#if 0
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#elif 0
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
...
driver/src/conv_driver.cpp
View file @
39821a90
...
@@ -52,7 +52,7 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
    // 3x3, 71x71
    constexpr index_t N = 128;
    constexpr index_t C = 192;
...
@@ -67,7 +67,7 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<1, 1>;
    using RightPads = Sequence<1, 1>;
-#elif 0
+#elif 1
    // 1x1, 8x8
    constexpr index_t N = 128;
    constexpr index_t C = 1536;
...
@@ -592,7 +592,7 @@ int main(int argc, char* argv[])
        LeftPads{},
        RightPads{},
        nrepeat);
-#elif 0
+#elif 1
    device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,
...