gaoqiong / composable_kernel

Commit 9d5d6afa, authored Apr 22, 2021 by Chao Liu
parent dcee43fe

    updating v5r1
Showing 6 changed files with 87 additions and 24 deletions (+87 -24)
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp  +25 -17
composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp                        +17  -0
composable_kernel/include/utility/amd_inline_asm.hpp                                     +38  -0
composable_kernel/include/utility/config.amd.hpp.in                                       +2  -2
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp   +2  -2
driver/src/conv_driver.cpp                                                                +3  -3
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
@@ -98,9 +98,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
             is_known_at_compile_time<remove_cv_t<remove_reference_t<SrcSliceOriginIdx>>>::value,
             "wrong! SrcSliceOrigin need to known at compile-time");

+#if 0 // debug
+        // TODO: turn this on, once v5r1 is updated to use StaticBuffer for holding C data
+        static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer");
+
         static_assert(is_same<remove_cv_t<remove_reference_t<typename SrcBuffer::type>>,
                               remove_cv_t<remove_reference_t<SrcData>>>::value,
                       "wrong! SrcBuffer data type is wrong");
+#endif

         // SrcDesc and src_slice_origin_idx are known at compile-time
         constexpr auto src_desc = remove_cv_t<remove_reference_t<SrcDesc>>{};
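Note for readers outside the codebase: the static_assert above hinges on a trait that accepts only integral wrappers whose value is encoded in the type, so the slice origin can be folded away at compile time. A minimal stand-alone sketch of that idea, with hypothetical simplified Number<> and trait definitions rather than the library's own:

#include <type_traits>

// Hypothetical stand-ins for illustration only; composable_kernel's real
// Number<> and is_known_at_compile_time are richer than this.
template <int N>
struct Number
{
    static constexpr int value = N;
};

template <typename T>
struct is_known_at_compile_time : std::false_type
{
};

template <int N>
struct is_known_at_compile_time<Number<N>> : std::true_type
{
};

static_assert(is_known_at_compile_time<Number<3>>::value, "type-encoded index: OK");
static_assert(!is_known_at_compile_time<int>::value, "plain int is run-time only");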
@@ -758,6 +763,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        static_assert(DstAddressSpace == AddressSpace::Global or
                          DstAddressSpace == AddressSpace::Lds,
                      "wrong!");

+        // TODO: fix this
+        static_assert(is_same<SrcData, DstData>::value,
+                      "wrong! current implementation assume SrcData and DstData are same type");
    }

    __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
@@ -859,11 +868,10 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
            return src_data_idx;
        }();

-        // copy data
-        typename vector_type_maker<SrcData, SrcScalarPerVector>::type src_vector;
-
-        using src_vector_t = typename vector_type_maker<SrcData, SrcScalarPerVector>::type::type;
+        // copy data from src_buf to src_tmp_vector
+        vector_type_maker_t<SrcData, SrcScalarPerVector> src_tmp_vector;
+
+        using src_vector_t = typename decltype(src_tmp_vector)::type;

        const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
            src_desc, src_slice_origin_coord_);
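The rename from src_vector to src_tmp_vector comes with the vector_type_maker_t alias, which drops the typename ...::type spelling at each use site and lets decltype recover the raw vector type. A hypothetical simplified model of the pattern (std::array standing in for the real hardware vector types):

#include <array>

// Hypothetical simplified model for illustration; the real vector_type_maker in
// composable_kernel wraps hardware vector types, not std::array.
template <typename T, int N>
struct vector_container
{
    using type = std::array<T, N>; // stand-in for the raw vector type
    type data_;
};

template <typename T, int N>
struct vector_type_maker
{
    using type = vector_container<T, N>;
};

// What the `_t` alias buys: no `typename ...::type` at the use site.
template <typename T, int N>
using vector_type_maker_t = typename vector_type_maker<T, N>::type;

int main()
{
    // Mirrors the rewritten lines above:
    vector_type_maker_t<float, 4> src_tmp_vector{};      // was: typename vector_type_maker<...>::type
    using src_vector_t = decltype(src_tmp_vector)::type; // was: typename vector_type_maker<...>::type::type
    src_tmp_vector.data_ = src_vector_t{0.f, 1.f, 2.f, 3.f};
    return static_cast<int>(src_tmp_vector.data_[3]) - 3; // returns 0
}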
@@ -871,14 +879,14 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        if constexpr(SrcAddressSpace == AddressSpace::Global)
        {
#if CK_USE_AMD_BUFFER_ADDRESSING
-            src_vector.template AsType<src_vector_t>()(Number<0>{}) =
+            src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                amd_buffer_load_v2<SrcData, SrcScalarPerVector>(
                    p_src,
                    src_slice_origin_coord_.GetOffset(),
                    is_src_valid,
                    src_desc.GetElementSpaceSize());
#else
-            src_vector.template AsType<src_vector_t>()(Number<0>{}) =
+            src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                is_src_valid ? *reinterpret_cast<const src_vector_t*>(
                                   &p_src[src_slice_origin_coord_.GetOffset()])
                             : src_vector_t{0};
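Both branches load SrcScalarPerVector elements in one shot, substituting a zero-filled vector when the coordinate is out of bounds; the AMD buffer-addressing path delegates that predication to amd_buffer_load_v2, while the #else path does it in source. A host-side sketch of the guarded-load pattern under hypothetical names (std::memcpy replaces the reinterpret_cast to stay aliasing-safe in portable C++):

#include <cstring>

// Illustrative stand-in for a 4-wide vector; names are not the library's.
struct float4_t
{
    float v[4];
};

float4_t guarded_load(const float* p_src, long offset, bool is_valid)
{
    float4_t out{}; // zero-filled fallback, as in `: src_vector_t{0}` above
    if(is_valid)
    {
        std::memcpy(&out, p_src + offset, sizeof(out)); // one vector-sized read
    }
    return out;
}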
@@ -886,18 +894,18 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
        }
        else
        {
-            src_vector.template AsType<src_vector_t>()(Number<0>{}) =
+            src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
                is_src_valid ? *reinterpret_cast<const src_vector_t*>(
                                   &p_src[src_slice_origin_coord_.GetOffset()])
                             : src_vector_t{0};
        }

+        // copy data from src_tmp_vector to buffer_
        static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
            constexpr index_t buffer_offset =
                buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);

-            buffer_.template AsType<SrcData>()(Number<buffer_offset>{}) =
-                src_vector.template AsType<SrcData>()[i];
+            buffer_(Number<buffer_offset>{}) =
+                src_tmp_vector.template AsType<SrcData>()[i];
        });

        constexpr auto move_on_dim = [&]() constexpr
@@ -1048,21 +1056,21 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
                          DstInMemOp == InMemoryDataOperation::Set,
                      "wrong! hardcoded for ds_write");

-        typename vector_type_maker<DstData, DstScalarPerVector>::type dst_vector;
+        vector_type_maker_t<DstData, DstScalarPerVector> dst_tmp_vector;

+        // copy data from buffer_ to dst_tmp_vector
        static_for<0, DstScalarPerVector, 1>{}([&](auto i) {
            constexpr index_t buffer_offset =
                buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector);

-            dst_vector.template AsType<DstData>()(i) =
-                buffer_.template AsType<DstData>()[Number<buffer_offset>{}];
+            dst_tmp_vector.template AsType<DstData>()(i) =
+                buffer_[Number<buffer_offset>{}];
        });

-        using DstVectorType =
-            typename vector_type_maker<DstData, DstScalarPerVector>::type::type;
+        using dst_vector_t = typename decltype(dst_tmp_vector)::type;

-        *reinterpret_cast<DstVectorType*>(p_dst + dst_slice_origin_coord_.GetOffset()) =
-            dst_vector.template AsType<DstVectorType>()[Number<0>{}];
+        // copy data from dst_tmp_vector to dst_buf
+        *reinterpret_cast<dst_vector_t*>(p_dst + dst_slice_origin_coord_.GetOffset()) =
+            dst_tmp_vector.template AsType<dst_vector_t>()[Number<0>{}];

        constexpr auto move_on_dim = [&]() constexpr
        {
@@ -1319,7 +1327,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
    static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();

-    typename vector_type_maker<SrcData, buffer_size_>::type buffer_;
+    StaticBuffer<SrcData, buffer_size_> buffer_;

    SrcCoord src_slice_origin_coord_;
    DstCoord dst_slice_origin_coord_;
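Switching buffer_ from a vector_type to a StaticBuffer matches the TODO in the first hunk ("once v5r1 is updated to use StaticBuffer for holding C data"): with every index a compile-time Number<>, each element can resolve to its own register. A heavily simplified, hypothetical model of the access pattern used above (buffer_(Number<k>{}) to write, buffer_[Number<k>{}] to read), not the library's actual definition:

// Hypothetical sketch for illustration only.
template <int N>
struct Number
{
    static constexpr int value = N;
};

template <typename T, int NSize>
struct StaticBuffer
{
    T data_[NSize];

    template <int I>
    constexpr const T& operator[](Number<I>) const // read, e.g. buffer_[Number<k>{}]
    {
        return data_[I];
    }

    template <int I>
    constexpr T& operator()(Number<I>) // write, e.g. buffer_(Number<k>{}) = x
    {
        return data_[I];
    }
};

int main()
{
    StaticBuffer<float, 8> buffer_{};
    buffer_(Number<3>{}) = 1.0f;             // write, as in the hunks above
    return buffer_[Number<3>{}] == 1.0f ? 0 : 1;
}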
composable_kernel/include/tensor_operation/threadwise_gemm_v3.hpp
@@ -61,6 +61,7 @@ struct ThreadwiseGemm_km_kn_mn_v3
        static_for<0, E, 1>{}([&](auto e) {
            static_for<0, K, 1>{}([&](auto k) {
+#if 0
                constexpr auto a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));

                if constexpr(H == 2 && W == 2)
@@ -123,6 +124,22 @@ struct ThreadwiseGemm_km_kn_mn_v3
                });
            });
        }
+#else
+                constexpr index_t a_offset = ADesc{}.CalculateOffset(make_tuple(e, k));
+
+                static_for<0, H, 1>{}([&](auto h) {
+                    static_for<0, W, 1>{}([&](auto w) {
+                        constexpr index_t b_offset =
+                            BDesc{}.CalculateOffset(make_tuple(e, 0, h, w));
+                        constexpr index_t c_offset =
+                            CDesc{}.CalculateOffset(make_tuple(k, 0, h, w));
+
+                        amd_assembly_inner_product(p_a[Number<a_offset>{}],
+                                                   p_b[Number<b_offset>{}],
+                                                   p_c[Number<c_offset>{}]);
+                    });
+                });
+#endif
            });
        });
    }
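The new #else branch unrolls over (k, h, w) at compile time and forwards one A element, one B element, and one C accumulator reference per call to amd_assembly_inner_product. For the int8x4_t overload visible in the next file's hunk header, the assumed per-call semantics are a 4-wide dot product accumulated into c (v_dot4-style); a scalar reference sketch, not the library's implementation:

#include <cstdint>

// Assumed semantics of one int8x4 inner-product call: c += dot(a, b).
void amd_assembly_inner_product_ref(const int8_t a[4], const int8_t b[4], int32_t& c)
{
    for(int i = 0; i < 4; ++i)
    {
        c += int32_t(a[i]) * int32_t(b[i]);
    }
}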
composable_kernel/include/utility/amd_inline_asm.hpp
@@ -36,6 +36,44 @@ __device__ void amd_assembly_inner_product(const int8x4_t& a, const int8x4_t& b,
#endif
}

+__device__ void amd_assembly_inner_product(const int8x8_t& a, const int8x8_t& b, int32_t& c)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    amd_assembly_inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
+                               vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
+                               vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
+                               c);
+}
+
+__device__ void amd_assembly_inner_product(const int8x16_t& a, const int8x16_t& b, int32_t& c)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
+                               c);
+
+    amd_assembly_inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
+                               vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
+                               c);
+}

#if 0
// c0 += inner_product(a, b0)
// c1 += inner_product(a, b1)
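The two added overloads reduce the 8- and 16-wide cases to the existing int8x4_t building block, reusing the same int32_t accumulator across chunks. A reference sketch (assumed semantics) of why the chunking does not change the result:

#include <cstdint>

// Integer dot products associate freely over chunks, so four chained 4-wide
// products equal one 16-wide product.
int32_t dot4_ref(const int8_t* a, const int8_t* b, int32_t c)
{
    for(int i = 0; i < 4; ++i)
    {
        c += int32_t(a[i]) * int32_t(b[i]);
    }
    return c;
}

int32_t dot16_ref(const int8_t a[16], const int8_t b[16], int32_t c)
{
    for(int chunk = 0; chunk < 4; ++chunk) // mirrors the I0..I3 calls above
    {
        c = dot4_ref(a + 4 * chunk, b + 4 * chunk, c);
    }
    return c;
}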
composable_kernel/include/utility/config.amd.hpp.in
@@ -14,11 +14,11 @@
#define CK_DEVICE_BACKEND_AMD 1

// GPU ID
-#if 1
+#if 0
#define CK_AMD_GPU_GFX906 1
#elif 0
#define CK_AMD_GPU_GFX908 1
-#elif 0
+#elif 1
#define CK_AMD_GPU_GFX1030 1
#endif
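This flips the default build target from gfx906 (Vega 20) to gfx1030 (Navi 21, RDNA2). A hypothetical sketch of how such a GPU-ID macro is consumed downstream (illustrative consumer code, not from this commit):

// Hypothetical consumer of the GPU-ID switch; not code from this repo.
#if defined(CK_AMD_GPU_GFX1030)
constexpr int wavefront_size = 32; // RDNA2 runs wave32 natively
#else
constexpr int wavefront_size = 64; // GCN/CDNA targets (gfx906, gfx908) run wave64
#endif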
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
@@ -53,7 +53,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
    constexpr auto C0 = C / Number<InWeiVectorSize>{};
    constexpr auto C1 = Number<InWeiVectorSize>{};

-#if 1
+#if 0
    // run-time variables
    constexpr auto in_n_hi_wi_c0_desc =
        make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, Hi, Wi, C0));
@@ -112,7 +112,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
    wei_k_y_x_c_device_buf.ToDevice(wei_k_y_x_c.mData.data());
    out_n_ho_wo_k_device_buf.ToDevice(out_n_ho_wo_k.mData.data());

-#if 0
+#if 1
    // cdata = 16, BlockSize = 64, 16x64x4
    constexpr index_t BlockSize = 64;
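The two toggles move this driver off the run-time-variable descriptor path and onto the tuned compile-time configuration (BlockSize = 64, 16x64x4). For context on the make_dynamic_naive_tensor_descriptor_packed_v2(make_multi_index(N, Hi, Wi, C0)) call above: a packed NHWC-style descriptor implies strides that are suffix products of the lengths, roughly as in this hypothetical stand-alone model (not the library's class):

#include <array>

// Hypothetical model of a packed (contiguous, row-major) 4-d descriptor.
struct PackedDesc4D
{
    std::array<long, 4> lengths_; // e.g. {N, Hi, Wi, C0}

    long CalculateOffset(long n, long hi, long wi, long c0) const
    {
        // packed layout: strides are suffix products of the lengths
        return ((n * lengths_[1] + hi) * lengths_[2] + wi) * lengths_[3] + c0;
    }
};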
driver/src/conv_driver.cpp
@@ -64,7 +64,7 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
    constexpr index_t N  = 1;
    constexpr index_t C  = 16;
    constexpr index_t HI = 1080;
@@ -630,7 +630,7 @@ int main(int argc, char* argv[])
    print_array("ConvStrides", to_multi_index(ConvStrides{}));
    print_array("ConvDilations", to_multi_index(ConvDilations{}));

-#if 1
+#if 0
    using in_data_t                  = float;
    constexpr index_t in_vector_size = 1;
    using acc_data_t                 = float;
@@ -724,7 +724,7 @@ int main(int argc, char* argv[])
        LeftPads{},
        RightPads{},
        nrepeat);
-#elif 1
+#elif 0
    device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw<in_data_t,
                                                                         in_vector_size,
                                                                         acc_data_t,
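Together these toggles make the driver run the N = 1, C = 16, HI = 1080 problem through the v4r4 NHWC/KYXC/NHWK kernel instead of the NCHW variant. As a reference for the output sizes implied by HI, ConvStrides, ConvDilations, and the pads (standard convolution arithmetic, not code from this repo; the 3x3 kernel below is only an example):

#include <cstdio>

// Output length of a 1-d convolution window sweep; applied per spatial dim.
int conv_out_len(int in_len, int pad_l, int pad_r, int k, int stride, int dilation)
{
    const int k_eff = (k - 1) * dilation + 1; // dilated kernel extent
    return (in_len + pad_l + pad_r - k_eff) / stride + 1;
}

int main()
{
    // e.g. HI = 1080 from the branch enabled above, hypothetical 3x3 kernel,
    // stride 1, dilation 1, zero pads (LeftPads/RightPads = Sequence<0, 0>):
    std::printf("Ho = %d\n", conv_out_len(1080, 0, 0, 3, 1, 1)); // Ho = 1078
    return 0;
}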