gaoqiong / composable_kernel · Commits

Commit 5fdccfce
Authored Jan 20, 2021 by Chao Liu

    added vector load, but ISA is much worse

Parent: 44ddcdcb

Showing 5 changed files with 175 additions and 121 deletions (+175 -121)
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp   +131 -70
composable_kernel/include/utility/amd_buffer_addressing.hpp                               +10  -0
composable_kernel/include/utility/float_type.amd.hpp.in                                   +31  -48
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp   +2   -2
driver/src/conv_driver.cpp                                                                +1   -1
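The commit replaces per-element amd_buffer_load<SrcData, 1> calls in the threadwise transfer with one wider load of SrcScalarPerVector elements per access, whose lanes are then scattered into the thread-private register buffer. A minimal standalone sketch of that idea, with illustrative names (Vec4 and vector_load are stand-ins, not the repository's API, which uses amd_buffer_load / __llvm_amdgcn_buffer_load_f32x4):

#include <cstring>
#include <cstdio>

// Illustrative stand-in for a 4-wide hardware load.
struct Vec4 { float x[4]; };

Vec4 vector_load(const float* p) {
    Vec4 v;
    std::memcpy(&v, p, sizeof(v)); // one 16-byte transaction instead of four 4-byte ones
    return v;
}

int main() {
    float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float buffer[4]; // stand-in for the thread-private register buffer

    // Before: four scalar loads per access.
    for (int i = 0; i < 4; ++i) buffer[i] = src[i];

    // After: one vector load, then distribute the lanes into the buffer.
    Vec4 v = vector_load(src + 4);
    for (int i = 0; i < 4; ++i) buffer[i] = v.x[i];

    std::printf("%f %f %f %f\n", buffer[0], buffer[1], buffer[2], buffer[3]);
}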
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp (view file @ 5fdccfce)
@@ -784,27 +784,30 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         }();

         // TODO use constexpr for coordinate-step to make sure compiler behave correctly
-        const auto src_step_0_p1 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
+        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(0, 1) * src_scalar_per_access);

-        const auto src_step_0_m1 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
+        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(0, -1) * src_scalar_per_access);

-        const auto src_step_p1_0 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
+        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(1, 0) * src_scalar_per_access);

-        const auto src_step_m1_0 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
+        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(-1, 0) * src_scalar_per_access);

         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};

-        static_for<0, access_lengths[I0], 1>{}([&](auto iter0) {
-            static_for<0, access_lengths[I1], 1>{}([&](auto iter1) {
+        static_for<0, SliceLengths{}[I0], src_scalar_per_access[I0]>{}([&](auto iter0) {
+            static_for<0, SliceLengths{}[I1], src_scalar_per_access[I1]>{}([&](auto iter1) {
                 // step direction
-                constexpr bool forward_dim1 = (iter0.value % 2 == 0);
+                constexpr bool forward_dim1 =
+                    (iter0.value % (2 * src_scalar_per_access[I0]) == 0);

                 constexpr index_t i0 = iter0.value;
-                constexpr index_t i1 =
-                    forward_dim1 ? iter1.value : access_lengths[I1] - iter1.value - 1;
+                constexpr index_t i1 =
+                    forward_dim1 ? iter1.value
+                                 : SliceLengths{}[I1] - src_scalar_per_access[I1] - iter1.value;

                 // do work
                 // hardcoding for buffer_load
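The two loops above walk the 2D slice in a snake (zig-zag) order so that each access is reachable from the previous one with a single precomputed coordinate step; with vector loads, that step becomes src_scalar_per_access instead of 1, and the dim-1 direction flips once per dim-0 step of that size. A minimal standalone sketch of the traversal logic with illustrative lengths (len0/len1/spa0/spa1 are assumed values, not names from the repository):

#include <cstdio>

int main() {
    // Illustrative slice lengths and scalars-per-access (dim 1 is the vector dim).
    const int len0 = 4, len1 = 8;
    const int spa0 = 1, spa1 = 4;

    // Mirrors the diff: iterate in steps of src_scalar_per_access and flip the
    // dim-1 direction every other dim-0 step so neighbouring accesses stay adjacent.
    for (int iter0 = 0; iter0 < len0; iter0 += spa0) {
        for (int iter1 = 0; iter1 < len1; iter1 += spa1) {
            const bool forward_dim1 = (iter0 % (2 * spa0) == 0);
            const int i0 = iter0;
            const int i1 = forward_dim1 ? iter1 : len1 - spa1 - iter1;
            std::printf("access at (%d, %d), covering %d elements along dim 1\n", i0, i1, spa1);
        }
    }
}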
@@ -812,7 +815,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 static_assert(SrcAddressSpace == AddressSpace::Global,
                               "wrong! hardcoded to use buffer_load, src must be global mem");

-#if 1 // only works for SrcScalarPerVector == 1
+#if 0 // only works for SrcScalarPerVector == 1
                 auto src_data = amd_buffer_load<SrcData, 1>(
                     p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

@@ -822,17 +825,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 constexpr index_t buffer_offset =
                     buffer_desc_.CalculateOffset(make_multi_index(i0, i1));

-                buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
-#elif 1 // only works for SrcScalarPerVector == 1
-                auto src_data = amd_buffer_load<SrcData, 1>(
-                    p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());
-
-                const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                    src_desc, src_slice_origin_);
-
-                constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
-                    make_multi_index(i0, i1) * src_scalar_per_access);
-
                 buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
 #else
                 vector_type<SrcData, SrcScalarPerVector> src_vector;

@@ -849,10 +841,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                     constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
-                        make_multi_index(i0, i1) * src_scalar_per_access +
+                        make_multi_index(i0, i1) +
                         i * src_scalar_step_in_vector);

+                    // TODO: can buffe_ use vector access?
                     buffer_(Number<buffer_offset>{}) = src_vector[i];
                 });
 #endif
@@ -882,7 +872,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         if constexpr(SrcResetCoordinateAfterRun)
         {
             const auto src_back_step =
-                make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateBackStep());
+                make_dynamic_tensor_coordinate_step(src_desc, GetSrcCoordinateBackStep());

             move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
         }
@@ -959,7 +949,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         if constexpr(DstResetCoordinateAfterRun)
        {
             const auto dst_back_step =
-                make_dynamic_tensor_coordinate_step(dst_desc, GetCoordinateBackStep());
+                make_dynamic_tensor_coordinate_step(dst_desc, GetDstCoordinateBackStep());

             move_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_, dst_back_step);
         }
@@ -967,8 +957,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
     __device__ void RunRead_hack(const SrcDesc& src_desc, const SrcData* p_src)
     {
-        // hardcoding for buffer_load
-        // TODO refactor transfer_data() to encapsulate this
+        // hardcoded for 2D
+        // TODO implemente N-D
         static_assert(remove_reference_t<SrcDesc>::GetNumOfDimension() == 2,
                       "wrong! hardcoded for 2D tensor");
@@ -1017,50 +1007,71 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
 #if 0 // hack
         // TODO use constexpr for coordinate-step to make sure compiler behave correctly
-        const auto src_step_0_p1 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, 1));
+        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(0, 1) * src_scalar_per_access);

-        const auto src_step_0_m1 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(0, -1));
+        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(0, -1) * src_scalar_per_access);

-        const auto src_step_p1_0 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(1, 0));
+        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(1, 0) * src_scalar_per_access);

-        const auto src_step_m1_0 =
-            make_dynamic_tensor_coordinate_step(src_desc, make_multi_index(-1, 0));
+        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step(
+            src_desc, make_multi_index(-1, 0) * src_scalar_per_access);
 #elif 0
         // for padded input tensor
-        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});
+        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
+            src_desc,
+            make_multi_index(0, 1) * src_scalar_per_access,
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>{});

-        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});
+        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
+            src_desc,
+            make_multi_index(0, -1) * src_scalar_per_access,
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>{});

-        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});
+        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
+            src_desc,
+            make_multi_index(1, 0) * src_scalar_per_access,
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0>{});

-        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
+        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
+            src_desc,
+            make_multi_index(-1, 0) * src_scalar_per_access,
+            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0>{});
 #elif 1
         // for non-padded input tensor
-        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, 1), Sequence<0, 0, 0, 0, 0, 0, 1>{});
+        const auto src_step_0_p1 = make_dynamic_tensor_coordinate_step_hack(
+            src_desc, make_multi_index(0, 1) * src_scalar_per_access, Sequence<0, 0, 0, 0, 0, 0, 1>{});

-        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(0, -1), Sequence<0, 0, 0, 0, 0, 0, 2>{});
+        const auto src_step_0_m1 = make_dynamic_tensor_coordinate_step_hack(
+            src_desc, make_multi_index(0, -1) * src_scalar_per_access, Sequence<0, 0, 0, 0, 0, 0, 2>{});

-        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(1, 0), Sequence<0, 0, 0, 0, 0, 1, 0>{});
+        const auto src_step_p1_0 = make_dynamic_tensor_coordinate_step_hack(
+            src_desc, make_multi_index(1, 0) * src_scalar_per_access, Sequence<0, 0, 0, 0, 0, 1, 0>{});

-        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
-            src_desc, make_multi_index(-1, 0), Sequence<0, 0, 0, 0, 0, 2, 0>{});
+        const auto src_step_m1_0 = make_dynamic_tensor_coordinate_step_hack(
+            src_desc, make_multi_index(-1, 0) * src_scalar_per_access, Sequence<0, 0, 0, 0, 0, 2, 0>{});
 #endif

         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};

-        static_for<0, access_lengths[I0], 1>{}([&](auto iter0) {
-            static_for<0, access_lengths[I1], 1>{}([&](auto iter1) {
+        static_for<0, SliceLengths{}[I0], src_scalar_per_access[I0]>{}([&](auto iter0) {
+            static_for<0, SliceLengths{}[I1], src_scalar_per_access[I1]>{}([&](auto iter1) {
                 // step direction
-                constexpr bool forward_dim1 = (iter0.value % 2 == 0);
+                constexpr bool forward_dim1 =
+                    (iter0.value % (2 * src_scalar_per_access[I0]) == 0);

                 constexpr index_t i0 = iter0.value;
-                constexpr index_t i1 =
-                    forward_dim1 ? iter1.value : access_lengths[I1] - iter1.value - 1;
+                constexpr index_t i1 =
+                    forward_dim1 ? iter1.value
+                                 : SliceLengths{}[I1] - src_scalar_per_access[I1] - iter1.value;

                 // do work
                 // hardcoding for buffer_load
@@ -1068,7 +1079,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 static_assert(SrcAddressSpace == AddressSpace::Global,
                               "wrong! hardcoded to use buffer_load, src must be global mem");

-#if 1 // only works for SrcScalarPerVector == 1
+#if 0 // only works for SrcScalarPerVector == 1
                 auto src_data = amd_buffer_load<SrcData, 1>(
                     p_src, src_slice_origin_.GetOffset(), true, src_desc.GetElementSpaceSize());

@@ -1076,10 +1087,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                         src_desc, src_slice_origin_);

                 constexpr index_t buffer_offset =
-                    buffer_desc_.CalculateOffset(make_multi_index(i0, i1) * src_scalar_per_access);
+                    buffer_desc_.CalculateOffset(make_multi_index(i0, i1));

                 buffer_(Number<buffer_offset>{}) = is_valid ? src_data : SrcData{0};
-#elif 1
+#else
                 vector_type<SrcData, SrcScalarPerVector> src_vector;

                 using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;

@@ -1094,10 +1105,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                 static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
                     constexpr index_t buffer_offset = buffer_desc_.CalculateOffset(
-                        make_multi_index(i0, i1) * src_scalar_per_access +
+                        make_multi_index(i0, i1) +
                         i * src_scalar_step_in_vector);

+                    // TODO: can buffe_ use vector access?
                     buffer_(Number<buffer_offset>{}) = src_vector[i];
                 });
 #endif
@@ -1127,25 +1136,77 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         if constexpr(SrcResetCoordinateAfterRun)
         {
             const auto src_back_step =
-                make_dynamic_tensor_coordinate_step(src_desc, GetCoordinateBackStep());
+                make_dynamic_tensor_coordinate_step(src_desc, GetSrcCoordinateBackStep());

             move_dynamic_tensor_coordinate(src_desc, src_slice_origin_, src_back_step);
         }
     }

-    __device__ static constexpr auto GetCoordinateBackStep()
+    __device__ static constexpr auto GetSrcCoordinateBackStep()
     {
-        MultiIndex<nDim> back_step;
+        constexpr auto src_scalar_per_access = [&]() {
+            Index src_scalar_per_access;

-        back_step(Number<0>{}) = 1 - SliceLengths{}[0];
+            static_for<0, nDim, 1>{}([&](auto i) {
+                if constexpr(i == SrcVectorDim)
+                {
+                    src_scalar_per_access(i) = SrcScalarPerVector * SrcScalarStrideInVector;
+                }
+                else
+                {
+                    src_scalar_per_access(i) = 1;
+                }
+            });
+
+            return src_scalar_per_access;
+        }();
+
+        MultiIndex<nDim> src_back_step;
+
+        src_back_step(Number<0>{}) = src_scalar_per_access[Number<0>{}] - SliceLengths{}[0];

         static_for<1, nDim, 1>{}([&](auto i) {
-            back_step(i) =
-                (SliceLengths{}[i - Number<1>{}] % 2 == 0) ? 0 : (1 - SliceLengths{}[i]);
+            constexpr auto i_m1 = i - Number<1>{};
+
+            src_back_step(i) = (SliceLengths{}[i_m1] % (2 * src_scalar_per_access[i_m1]) == 0)
+                                   ? 0
+                                   : (src_scalar_per_access[i] - SliceLengths{}[i]);
         });

-        return back_step;
+        return src_back_step;
     }

+    __device__ static constexpr auto GetDstCoordinateBackStep()
+    {
+        constexpr auto dst_scalar_per_access = [&]() {
+            Index dst_scalar_per_access;
+
+            static_for<0, nDim, 1>{}([&](auto i) {
+                if constexpr(i == DstVectorDim)
+                {
+                    dst_scalar_per_access(i) = DstScalarPerVector * DstScalarStrideInVector;
+                }
+                else
+                {
+                    dst_scalar_per_access(i) = 1;
+                }
+            });
+
+            return dst_scalar_per_access;
+        }();
+
+        MultiIndex<nDim> dst_back_step;
+
+        dst_back_step(Number<0>{}) = dst_scalar_per_access[Number<0>{}] - SliceLengths{}[0];
+
+        static_for<1, nDim, 1>{}([&](auto i) {
+            constexpr auto i_m1 = i - Number<1>{};
+
+            dst_back_step(i) = (SliceLengths{}[i_m1] % (2 * dst_scalar_per_access[i_m1]) == 0)
+                                   ? 0
+                                   : (dst_scalar_per_access[i] - SliceLengths{}[i]);
+        });
+
+        return dst_back_step;
+    }
+
     // src_slice_origin_step_idx need to be known at compile-time, for performance reason
     __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
                                        const Index& src_slice_origin_step_idx)
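GetSrcCoordinateBackStep returns the multi-index step that moves the slice-window coordinate from wherever the snake traversal ends back to the slice origin; with vector access the last position along a dimension is SliceLengths[i] - scalar_per_access[i] rather than SliceLengths[i] - 1, which is what the new formula accounts for. A small standalone check of the arithmetic for the 2D case, using illustrative values (variable names are assumptions, not repository code):

#include <cassert>
#include <cstdio>

int main() {
    // Illustrative 2D slice: lengths {L0, L1}, scalars-per-access {s0, s1}.
    const int L0 = 3, L1 = 8;
    const int s0 = 1, s1 = 4;

    // Run the snake traversal and remember the last access coordinate.
    int last0 = 0, last1 = 0;
    for (int iter0 = 0; iter0 < L0; iter0 += s0) {
        for (int iter1 = 0; iter1 < L1; iter1 += s1) {
            const bool forward = (iter0 % (2 * s0) == 0);
            last0 = iter0;
            last1 = forward ? iter1 : L1 - s1 - iter1;
        }
    }

    // Back-step as computed in GetSrcCoordinateBackStep, specialised to 2D.
    const int back0 = s0 - L0;
    const int back1 = (L0 % (2 * s0) == 0) ? 0 : (s1 - L1);

    std::printf("traversal ends at (%d, %d), back step (%d, %d)\n", last0, last1, back0, back1);
    assert(last0 + back0 == 0 && last1 + back1 == 0); // lands back on the slice origin
}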
@@ -1153,7 +1214,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         // if src coord was not reset by RunRead(), then need to adjust the step here
         const auto adjusted_step_idx = SrcResetCoordinateAfterRun
                                            ? src_slice_origin_step_idx
-                                           : src_slice_origin_step_idx + GetCoordinateBackStep();
+                                           : src_slice_origin_step_idx + GetSrcCoordinateBackStep();

         // is it OK to construct a new step every time?
         const auto adjusted_step = make_dynamic_tensor_coordinate_step(src_desc, adjusted_step_idx);

@@ -1168,7 +1229,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         // if dst coord was not reset by RunWrite(), then need to adjust the step here
         const auto adjusted_step_idx = DstResetCoordinateAfterRun
                                            ? dst_slice_origin_step_idx
-                                           : dst_slice_origin_step_idx + GetCoordinateBackStep();
+                                           : dst_slice_origin_step_idx + GetDstCoordinateBackStep();

         // is it OK to construct a new step every time?
         const auto adjusted_step = make_dynamic_tensor_coordinate_step(dst_desc, adjusted_step_idx);

@@ -1183,7 +1244,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         // if src coord was not reset by RunRead(), then need to adjust the step here
         const auto adjusted_step_idx = SrcResetCoordinateAfterRun
                                            ? src_slice_origin_step_idx
-                                           : src_slice_origin_step_idx + GetCoordinateBackStep();
+                                           : src_slice_origin_step_idx + GetSrcCoordinateBackStep();

         // is it OK to construct a new step every time?
 #if 0 // hack
composable_kernel/include/utility/amd_buffer_addressing.hpp (view file @ 5fdccfce)
@@ -187,10 +187,15 @@ __device__ float2_t amd_buffer_load<float, 2>(const float* p_src_wave,
     return __llvm_amdgcn_buffer_load_f32x2(
         src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
 #else
+#if 0
     float2_t tmp = __llvm_amdgcn_buffer_load_f32x2(
         src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

     return src_thread_data_valid ? tmp : float2_t(0);
+#else
+    return __llvm_amdgcn_buffer_load_f32x2(
+        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
+#endif
 #endif
 }

@@ -217,10 +222,15 @@ __device__ float4_t amd_buffer_load<float, 4>(const float* p_src_wave,
     return __llvm_amdgcn_buffer_load_f32x4(
         src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false);
 #else
+#if 0
     float4_t tmp = __llvm_amdgcn_buffer_load_f32x4(
         src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);

     return src_thread_data_valid ? tmp : float4_t(0);
+#else
+    return __llvm_amdgcn_buffer_load_f32x4(
+        src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false);
+#endif
 #endif
 }
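The newly added #else path issues the wide buffer load unconditionally instead of selecting zero for invalid threads in software. A minimal standalone sketch contrasting the two patterns in plain C++ (load2 and the validity flag are illustrative stand-ins, not the AMD intrinsics used above):

#include <cstring>
#include <cstdio>

struct Float2 { float x[2]; };

// Stand-in for __llvm_amdgcn_buffer_load_f32x2: just a 2-wide load.
Float2 load2(const float* p) {
    Float2 v;
    std::memcpy(&v, p, sizeof(v));
    return v;
}

// Earlier pattern: load, then select zero for invalid lanes (extra select instructions).
Float2 load2_masked(const float* p, bool valid) {
    Float2 tmp = load2(p);
    return valid ? tmp : Float2{{0.0f, 0.0f}};
}

// Pattern added in this commit: issue the load unconditionally and leave validity
// handling to the caller (presumably the buffer descriptor's own range checking in the real code).
Float2 load2_raw(const float* p, bool /*valid*/) { return load2(p); }

int main() {
    float data[4] = {1, 2, 3, 4};
    Float2 a = load2_masked(data, true);
    Float2 b = load2_raw(data + 2, true);
    std::printf("%f %f %f %f\n", a.x[0], a.x[1], b.x[0], b.x[1]);
}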
composable_kernel/include/utility/float_type.amd.hpp.in (view file @ 5fdccfce)
@@ -184,6 +184,8 @@ struct vector_type<float, 1>
     float data_;

+    __host__ __device__ explicit constexpr vector_type() : data_{0} {}
+
     __host__ __device__ static constexpr index_t Size() { return 1; }

     __host__ __device__ constexpr const auto& Vector() const { return data_; }

@@ -212,25 +214,34 @@ struct vector_type<float, 2>
 {
     using MemoryType = float2_t;

-    union DataType
+    union
     {
-        MemoryType vector;
-        float scalar[2];
-    };
+        float2_t vector_;
+        StaticallyIndexedArray<float, 2> scalars_;
+    } data_;
+
+    __host__ __device__ explicit constexpr vector_type() : data_{MemoryType{0}} {}
+
+    __host__ __device__ static constexpr index_t Size() { return 2; }
+
+    __host__ __device__ constexpr const auto& Vector() const { return data_.vector_; }
+
+    __host__ __device__ constexpr auto& Vector() { return data_.vector_; }

     template <index_t I>
-    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
+    __host__ __device__ constexpr const auto& operator[](Number<I>) const
     {
-        static_assert(I < 2, "wrong");
+        static_assert(I >= 0 && I < 2, "wrong!");

-        *(reinterpret_cast<float*>(&v) + I) = s;
+        return data_.scalars_[Number<I>{}];
     }

-    __host__ __device__ static MemoryType Pack(float s0, float s1)
+    template <index_t I>
+    __host__ __device__ constexpr auto& operator()(Number<I>)
     {
-        DataType data;
+        static_assert(I >= 0 && I < 2, "wrong!");

-        data.scalar[0] = s0;
-        data.scalar[1] = s1;
-
-        return data.vector;
+        return data_.scalars_(Number<I>{});
     }
 };

@@ -241,37 +252,24 @@ struct vector_type<float, 4>
     union
     {
-        float4_t v;
-        float s0, s1, s2, s3;
+        float4_t vector_;
+        StaticallyIndexedArray<float, 4> scalars_;
     } data_;

+    __host__ __device__ explicit constexpr vector_type() : data_{MemoryType{0}} {}
+
     __host__ __device__ static constexpr index_t Size() { return 4; }

-    __host__ __device__ constexpr const auto& Vector() const { return data_.v; }
+    __host__ __device__ constexpr const auto& Vector() const { return data_.vector_; }

-    __host__ __device__ constexpr auto& Vector() { return data_.v; }
+    __host__ __device__ constexpr auto& Vector() { return data_.vector_; }

     template <index_t I>
     __host__ __device__ constexpr const auto& operator[](Number<I>) const
     {
         static_assert(I >= 0 && I < 4, "wrong!");

-        if constexpr(I == 0)
-        {
-            return data_.s0;
-        }
-        else if constexpr(I == 1)
-        {
-            return data_.s1;
-        }
-        else if constexpr(I == 2)
-        {
-            return data_.s2;
-        }
-        else
-        {
-            return data_.s3;
-        }
+        return data_.scalars_[Number<I>{}];
     }

     template <index_t I>

@@ -279,22 +277,7 @@ struct vector_type<float, 4>
     {
         static_assert(I >= 0 && I < 4, "wrong!");

-        if constexpr(I == 0)
-        {
-            return data_.s0;
-        }
-        else if constexpr(I == 1)
-        {
-            return data_.s1;
-        }
-        else if constexpr(I == 2)
-        {
-            return data_.s2;
-        }
-        else
-        {
-            return data_.s3;
-        }
+        return data_.scalars_(Number<I>{});
     }
 };
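The rewrite drops the s0..s3 members, the if constexpr chains, and the SetScalar/Pack helpers in favour of a union of the hardware vector type with a statically indexed array, so lanes are read with operator[](Number<I>) and written with operator()(Number<I>). A minimal standalone sketch of the same layout (Number, Float2, and Vec2 are illustrative stand-ins; a plain float[2] replaces the repository's StaticallyIndexedArray):

#include <cstdio>

// Compile-time index tag, analogous to ck::Number<I>.
template <int I>
struct Number {};

// Stand-in for the 2-wide hardware vector type (float2_t in the real header).
struct Float2 { float x, y; };

struct Vec2 {
    using MemoryType = Float2;

    union {
        MemoryType vector_;
        float scalars_[2]; // stand-in for StaticallyIndexedArray<float, 2>
    } data_;

    Vec2() : data_{MemoryType{0, 0}} {}

    const MemoryType& Vector() const { return data_.vector_; }
    MemoryType& Vector() { return data_.vector_; }

    // Read lane I with a compile-time index, like the new operator[](Number<I>).
    template <int I>
    const float& operator[](Number<I>) const {
        static_assert(I >= 0 && I < 2, "wrong!");
        return data_.scalars_[I];
    }

    // Write lane I with a compile-time index, like the new operator()(Number<I>).
    template <int I>
    float& operator()(Number<I>) {
        static_assert(I >= 0 && I < 2, "wrong!");
        return data_.scalars_[I];
    }
};

int main() {
    Vec2 v;
    v(Number<0>{}) = 1.0f; // lane-wise write
    v(Number<1>{}) = 2.0f;
    std::printf("%f %f\n", v[Number<0>{}], v[Number<1>{}]);
}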
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp (view file @ 5fdccfce)
@@ -135,7 +135,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
     using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<4, 1>;
     using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;

-    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 4;
     constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;

     using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<4, 1>;

@@ -145,7 +145,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
     constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;

     constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
-#elif 1
+#elif 0
     // cdata = 64, BlockSize = 256, 128x128x8
     // b thread copy 2x2
     constexpr index_t BlockSize = 256;
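With GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<4, 1>, each thread's A slice covers 4 consecutive GemmK elements, so raising GemmABlockTransferSrcScalarPerVector_GemmK from 1 to 4 appears to let the whole per-thread slice be fetched with a single 4-wide buffer load. A hedged standalone illustration of the constraint this implies (the names and the static_assert are made up for the sketch, not taken from the kernel):

#include <cstdio>

int main() {
    // Illustrative values mirroring the driver configuration above.
    constexpr int thread_slice_length_gemm_k   = 4; // Sequence<4, 1>, GemmK component
    constexpr int src_scalar_per_vector_gemm_k = 4; // raised from 1 to 4 in this commit

    // The vector width along GemmK has to divide the per-thread slice length,
    // otherwise the threadwise transfer cannot tile the slice with whole vectors.
    static_assert(thread_slice_length_gemm_k % src_scalar_per_vector_gemm_k == 0,
                  "vector width must divide the thread slice length");

    std::printf("loads per thread along GemmK: %d\n",
                thread_slice_length_gemm_k / src_scalar_per_vector_gemm_k);
}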
driver/src/conv_driver.cpp (view file @ 5fdccfce)
@@ -550,7 +550,7 @@ int main(int argc, char* argv[])
     in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
     wei_kcyx.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
 #elif 0
-    in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
+    in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
     wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 1
     in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);