gaoqiong / composable_kernel · Commits

Commit edc08fe6
authored Feb 25, 2021 by Chao Liu
parent ecad4061

static kernel use raw buffer load/store

Showing 4 changed files with 79 additions and 46 deletions (+79 -46)
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp   +69  -36
composable_kernel/include/utility/in_memory_operation.amd.hpp.in                            +7   -7
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp     +1   -1
driver/src/conv_driver.cpp                                                                  +2   -2
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
@@ -173,14 +173,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
         }();

         // copy data
-        // hardcoding for buffer_store
-        // TODO refactor transfer_data() to encapsulate this
-        static_assert(SrcAddressSpace == AddressSpace::Vgpr &&
-                          DstAddressSpace == AddressSpace::Global,
-                      "wrong! hardcoded to use buffer_store");
-
         vector_type<DstData, DstScalarPerVector> dst_vector;

+        using dst_vector_t = typename vector_type<DstData, DstScalarPerVector>::MemoryType;
+
         static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
             constexpr index_t src_offset =
                 src_desc.CalculateOffset(to_multi_index(src_slice_origin_idx) + dst_data_idx +
@@ -189,13 +185,35 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
             dst_vector.Scalars()(i) = p_src[Number<src_offset>{}];
         });

-        amd_buffer_store_v2<DstData, DstScalarPerVector>(
-            dst_vector.Vector(),
-            p_dst,
-            dst_slice_origin_coord_.GetOffset(),
-            coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc,
-                                                                        dst_slice_origin_coord_),
-            dst_desc.GetElementSpaceSize());
+        const bool is_dst_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
+            dst_desc, dst_slice_origin_coord_);
+
+        if constexpr(SrcAddressSpace == AddressSpace::Vgpr &&
+                     DstAddressSpace == AddressSpace::Global)
+        {
+#if CK_USE_AMD_BUFFER_ADDRESSING
+            amd_buffer_store_v2<DstData, DstScalarPerVector>(dst_vector.Vector(),
+                                                             p_dst,
+                                                             dst_slice_origin_coord_.GetOffset(),
+                                                             is_dst_valid,
+                                                             dst_desc.GetElementSpaceSize());
+#else
+            if(is_dst_valid)
+            {
+                *reinterpret_cast<dst_vector_t*>(&(p_dst[dst_slice_origin_coord_.GetOffset()])) =
+                    dst_vector.Vector();
+            }
+#endif
+        }
+        else
+        {
+            if(is_dst_valid)
+            {
+                *reinterpret_cast<dst_vector_t*>(&(p_dst[dst_slice_origin_coord_.GetOffset()])) =
+                    dst_vector.Vector();
+            }
+        }

         constexpr auto move_on_dim = [&]() constexpr
         {
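Taken together, the v1r3 change replaces the unconditional buffer_store with a predicated store: destination validity is computed once, and the write either goes through amd_buffer_store_v2 (when CK_USE_AMD_BUFFER_ADDRESSING is set and the destination is global memory) or through a guarded vector store in plain C++. Below is a minimal standalone sketch of that fallback path only; the names (predicated_vector_store, Vec4) are hypothetical stand-ins for the CK vector types, not the library's API.

    #include <cstddef>

    // Hypothetical 4-wide vector, standing in for vector_type<DstData, N>::MemoryType.
    struct Vec4
    {
        float data[4];
    };

    // Sketch of the non-buffer-addressing store fallback: write the whole vector only
    // when the destination coordinate is valid; out-of-range stores are simply skipped,
    // matching what the hardware-predicated raw buffer store does.
    inline void predicated_vector_store(float* p_dst,
                                        std::size_t dst_offset,
                                        bool is_dst_valid,
                                        const Vec4& dst_vector)
    {
        if(is_dst_valid)
        {
            // Mirrors: *reinterpret_cast<dst_vector_t*>(&p_dst[offset]) = dst_vector.Vector();
            *reinterpret_cast<Vec4*>(&p_dst[dst_offset]) = dst_vector;
        }
    }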
@@ -482,33 +500,36 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2
         }();

         // copy data
-        // hardcoding for buffer_store
-        // TODO refactor transfer_data() to encapsulate this
         static_assert(DstAddressSpace == AddressSpace::Vgpr, "wrong! hardcode for ds_read");

         vector_type<SrcData, SrcScalarPerVector> src_vector;

         using src_vector_t = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;

+        const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
+            src_desc, src_slice_origin_coord_);
+
         if constexpr(SrcAddressSpace == AddressSpace::Global)
         {
-            const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                src_desc, src_slice_origin_coord_);
-
+#if CK_USE_AMD_BUFFER_ADDRESSING
             src_vector.Vector() = amd_buffer_load_v2<SrcData, SrcScalarPerVector>(
                 p_src,
                 src_slice_origin_coord_.GetOffset(),
-                is_valid,
+                is_src_valid,
                 src_desc.GetElementSpaceSize());
+#else
+            src_vector.Vector() = is_src_valid
+                                      ? *reinterpret_cast<const src_vector_t*>(
+                                            &p_src[src_slice_origin_coord_.GetOffset()])
+                                      : src_vector_t{0};
+#endif
         }
         else
         {
-            const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
-                src_desc, src_slice_origin_coord_);
-
-            src_vector.Vector() = is_valid
+            src_vector.Vector() = is_src_valid
                                       ? *reinterpret_cast<const src_vector_t*>(
                                             &p_src[src_slice_origin_coord_.GetOffset()])
                                       : src_vector_t{0};
         }

         static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
@@ -815,23 +836,35 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
         }();

         // copy data
-        // hardcoding for buffer_load
-        // TODO refactor transfer_data() to encapsulate this
-        static_assert(SrcAddressSpace == AddressSpace::Global,
-                      "wrong! hardcoded to use buffer_load, src must be global mem");
-
         vector_type<SrcData, SrcScalarPerVector> src_vector;

         using src_vector_t = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;

-        const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
+        const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
             src_desc, src_slice_origin_coord_);

-        src_vector.Vector() =
-            amd_buffer_load_v2<SrcData, SrcScalarPerVector>(p_src,
-                                                            src_slice_origin_coord_.GetOffset(),
-                                                            is_valid,
-                                                            src_desc.GetElementSpaceSize());
+        if constexpr(SrcAddressSpace == AddressSpace::Global)
+        {
+#if CK_USE_AMD_BUFFER_ADDRESSING
+            src_vector.Vector() =
+                amd_buffer_load_v2<SrcData, SrcScalarPerVector>(p_src,
+                                                                src_slice_origin_coord_.GetOffset(),
+                                                                is_src_valid,
+                                                                src_desc.GetElementSpaceSize());
+#else
+            src_vector.Vector() = is_src_valid
+                                      ? *reinterpret_cast<const src_vector_t*>(
+                                            &p_src[src_slice_origin_coord_.GetOffset()])
+                                      : src_vector_t{0};
+#endif
+        }
+        else
+        {
+            src_vector.Vector() = is_src_valid
+                                      ? *reinterpret_cast<const src_vector_t*>(
+                                            &p_src[src_slice_origin_coord_.GetOffset()])
+                                      : src_vector_t{0};
+        }

         static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
             constexpr index_t buffer_offset =
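The v2 and v3 hunks make the same change on the load side: source validity is hoisted into is_src_valid, and the read either goes through amd_buffer_load_v2 or, with CK_USE_AMD_BUFFER_ADDRESSING disabled or a non-global source, through a guarded vector load that substitutes zeros for out-of-range reads. A standalone sketch of that fallback follows; predicated_vector_load and Vec4 are hypothetical names, not the CK types.

    #include <cstddef>

    // Hypothetical 4-wide vector, standing in for vector_type<SrcData, N>::MemoryType.
    struct Vec4
    {
        float data[4];
    };

    // Sketch of the non-buffer-addressing load fallback: return the vector at the
    // given offset when the source coordinate is valid, otherwise return zeros,
    // which matches the out-of-bounds behaviour of the raw buffer load.
    inline Vec4 predicated_vector_load(const float* p_src,
                                       std::size_t src_offset,
                                       bool is_src_valid)
    {
        return is_src_valid ? *reinterpret_cast<const Vec4*>(&p_src[src_offset]) : Vec4{};
    }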
composable_kernel/include/utility/in_memory_operation.amd.hpp.in

@@ -89,7 +89,7 @@ struct SetData
         if(dst_valid)
         {
             *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-                amd_buffer_load<T, DataPerAccess>(p_src, src_offset, src_valid, src_range);
+                amd_buffer_load_v2<T, DataPerAccess>(p_src, src_offset, src_valid, src_range);
         }
     }
@@ -109,12 +109,12 @@ struct SetData
     {
         const auto zeros = vector_t(0);

-        amd_buffer_store<T, DataPerAccess>(
-            src_valid ? &(p_src[src_offset]) : reinterpret_cast<const T*>(&zeros),
+        amd_buffer_store_v2<T, DataPerAccess>(
+            src_valid ? *reinterpret_cast<const vector_t*>(&(p_src[src_offset])) : zeros,
             p_dst,
             dst_offset,
             dst_valid,
             dst_range);
     }
#endif
};
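The SetData change is an interface adjustment: the old amd_buffer_store took a pointer to the source element (or a pointer to a zeros vector), while amd_buffer_store_v2 takes the vector value itself, so the caller resolves src_valid up front and passes either the loaded vector or zeros. A hedged sketch of that by-value calling pattern is below; store_vector_v2, set_data_sketch, and Vec4 are hypothetical names rather than the real CK signatures, and the dst_range argument is omitted.

    #include <cstddef>

    // Hypothetical 4-wide vector, standing in for the vector_t used by SetData.
    struct Vec4
    {
        float data[4];
    };

    // Hypothetical by-value store wrapper mirroring the amd_buffer_store_v2 call shape:
    // the value to write is fully resolved before the call.
    inline void store_vector_v2(const Vec4& value, float* p_dst, std::size_t dst_offset, bool dst_valid)
    {
        if(dst_valid)
        {
            *reinterpret_cast<Vec4*>(&p_dst[dst_offset]) = value;
        }
    }

    // Usage matching the diff: pick the real source data or zeros depending on src_valid,
    // then hand the chosen value to the store.
    inline void set_data_sketch(const float* p_src, std::size_t src_offset, bool src_valid,
                                float* p_dst, std::size_t dst_offset, bool dst_valid)
    {
        const Vec4 zeros{};
        const Vec4 value = src_valid ? *reinterpret_cast<const Vec4*>(&p_src[src_offset]) : zeros;
        store_vector_v2(value, p_dst, dst_offset, dst_valid);
    }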
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -67,7 +67,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
     const auto in_right_pads = sequence_to_tuple_of_number(InRightPads{});
#endif

-#if 1
+#if 0
     // cdata = 16, BlockSize = 64, 16x64x4
     constexpr index_t BlockSize = 64;
driver/src/conv_driver.cpp

@@ -674,7 +674,7 @@ int main(int argc, char* argv[])
                                                                 LeftPads{},
                                                                 RightPads{},
                                                                 nrepeat);
-#elif 1
+#elif 0
     device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                                  in_nchw,
                                                                  wei_kcyx_desc,
@@ -686,7 +686,7 @@ int main(int argc, char* argv[])
                                                                          LeftPads{},
                                                                          RightPads{},
                                                                          nrepeat);
-#elif 0
+#elif 1
     device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                                          in_nchw,
                                                                          wei_kcyx_desc,