Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
95a83c6e
Commit
95a83c6e
authored
Nov 18, 2022
by
Adam Osewski
Browse files
Merge remote-tracking branch 'origin/develop' into wavelet_model
parents
5b7c2432
892a8d76
Changes
618
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
516 additions
and
124 deletions
+516
-124
example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
+7
-5
example/01_gemm/run_gemm_example.inc
example/01_gemm/run_gemm_example.inc
+6
-8
example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+7
-7
example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
+6
-5
example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
...mm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+3
-3
example/09_convnd_fwd/CMakeLists.txt
example/09_convnd_fwd/CMakeLists.txt
+4
-0
example/09_convnd_fwd/convnd_fwd_common.hpp
example/09_convnd_fwd/convnd_fwd_common.hpp
+3
-2
example/09_convnd_fwd/convnd_fwd_dl_common.hpp
example/09_convnd_fwd/convnd_fwd_dl_common.hpp
+171
-0
example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
+39
-0
example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
+39
-0
example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
+39
-0
example/09_convnd_fwd/run_convnd_fwd_dl_example.inc
example/09_convnd_fwd/run_convnd_fwd_dl_example.inc
+97
-0
example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+2
-10
example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
...multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
+12
-18
example/12_reduce/reduce_blockwise.cpp
example/12_reduce/reduce_blockwise.cpp
+5
-1
example/12_reduce/reduce_blockwise_impl.hpp
example/12_reduce/reduce_blockwise_impl.hpp
+19
-16
example/12_reduce/reduce_blockwise_two_call.cpp
example/12_reduce/reduce_blockwise_two_call.cpp
+27
-27
example/12_reduce/reduce_example_common.hpp
example/12_reduce/reduce_example_common.hpp
+7
-6
example/12_reduce/reduce_multiblock_atomic_add.cpp
example/12_reduce/reduce_multiblock_atomic_add.cpp
+5
-1
example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
+18
-15
No files found.
example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
View file @
95a83c6e
...
@@ -6,6 +6,8 @@
...
@@ -6,6 +6,8 @@
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp"
#include "ck/library/utility/literals.hpp"
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F32
=
float
;
...
@@ -135,15 +137,15 @@ int main(int argc, char* argv[])
...
@@ -135,15 +137,15 @@ int main(int argc, char* argv[])
auto
f_host_tensor_descriptor
=
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
std
::
vector
<
std
::
size_t
>
({
stride
,
1
}));
}
}
else
else
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
std
::
vector
<
std
::
size_t
>
({
1
,
stride
}));
}
}
};
};
...
@@ -240,7 +242,7 @@ int main(int argc, char* argv[])
...
@@ -240,7 +242,7 @@ int main(int argc, char* argv[])
show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl;
show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl;
}
}
#endif
#endif
ck
::
utils
::
check_err
(
c_m_n_device_result
.
mData
,
c_m_n_host_result
.
mData
);
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
);
}
}
return
0
;
return
0
;
...
...
example/01_gemm/run_gemm_example.inc
View file @
95a83c6e
...
@@ -32,14 +32,12 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
...
@@ -32,14 +32,12 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
{
case
0
:
break
;
case
0
:
break
;
case
1
:
case
1
:
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
5.
f
,
5.
f
}(
a_m_k
.
begin
(),
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
5.
f
,
5.
f
}(
a_m_k
);
a_m_k
.
end
());
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
-
5.
f
,
5.
f
}(
b_k_n
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
-
5.
f
,
5.
f
}(
b_k_n
.
begin
(),
b_k_n
.
end
());
break
;
break
;
default
:
default
:
ck
::
utils
::
FillUniformDistribution
<
ADataType
>
{
-
1.
f
,
1.
f
}(
a_m_k
.
begin
(),
a_m_k
.
end
()
);
ck
::
utils
::
FillUniformDistribution
<
ADataType
>
{
-
1.
f
,
1.
f
}(
a_m_k
);
ck
::
utils
::
FillUniformDistribution
<
BDataType
>
{
-
1.
f
,
1.
f
}(
b_k_n
.
begin
(),
b_k_n
.
end
()
);
ck
::
utils
::
FillUniformDistribution
<
BDataType
>
{
-
1.
f
,
1.
f
}(
b_k_n
);
}
}
Tensor
<
CDataType
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
CDataType
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
...
@@ -133,11 +131,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
...
@@ -133,11 +131,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
c_m_n_device_result
=
c_m_n_device_result_converted
.
CopyAsType
<
CDataType
>
();
c_m_n_device_result
=
c_m_n_device_result_converted
.
CopyAsType
<
CDataType
>
();
return
ck
::
utils
::
check_err
(
c_m_n_device_result_converted
.
mData
,
c_m_n_host_result
.
mData
);
return
ck
::
utils
::
check_err
(
c_m_n_device_result_converted
,
c_m_n_host_result
);
#else
#else
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
c_m_n_device_result
.
mData
,
c_m_n_host_result
.
mData
);
return
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
);
#endif
#endif
}
}
...
...
example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
View file @
95a83c6e
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
...
@@ -177,15 +178,15 @@ int main(int argc, char* argv[])
...
@@ -177,15 +178,15 @@ int main(int argc, char* argv[])
auto
f_host_tensor_descriptor
=
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
std
::
vector
<
std
::
size_t
>
({
stride
,
1
}));
}
}
else
else
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
std
::
vector
<
std
::
size_t
>
({
1
,
stride
}));
}
}
};
};
...
@@ -271,8 +272,7 @@ int main(int argc, char* argv[])
...
@@ -271,8 +272,7 @@ int main(int argc, char* argv[])
if
(
do_verification
)
if
(
do_verification
)
{
{
Tensor
<
CShuffleDataType
>
c_m_n
(
HostTensorDescriptor
(
Tensor
<
CShuffleDataType
>
c_m_n
({
M
,
N
});
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
M
),
static_cast
<
std
::
size_t
>
(
N
)}));
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
BDataType
,
...
@@ -299,7 +299,7 @@ int main(int argc, char* argv[])
...
@@ -299,7 +299,7 @@ int main(int argc, char* argv[])
e_device_buf
.
FromDevice
(
e_m_n_device_result
.
mData
.
data
());
e_device_buf
.
FromDevice
(
e_m_n_device_result
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
e_m_n_device_result
.
mData
,
e_m_n_host_result
.
mData
)
?
0
:
1
;
return
ck
::
utils
::
check_err
(
e_m_n_device_result
,
e_m_n_host_result
)
?
0
:
1
;
}
}
return
0
;
return
0
;
...
...
example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
View file @
95a83c6e
...
@@ -15,6 +15,7 @@
...
@@ -15,6 +15,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
...
@@ -155,15 +156,15 @@ int main(int argc, char* argv[])
...
@@ -155,15 +156,15 @@ int main(int argc, char* argv[])
auto
f_host_tensor_descriptor
=
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
std
::
vector
<
std
::
size_t
>
({
stride
,
1
}));
}
}
else
else
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
std
::
vector
<
std
::
size_t
>
({
1
,
stride
}));
}
}
};
};
...
@@ -275,7 +276,7 @@ int main(int argc, char* argv[])
...
@@ -275,7 +276,7 @@ int main(int argc, char* argv[])
}
}
}
}
return
ck
::
utils
::
check_err
(
e_m_n_device_result
.
mData
,
e_m_n_host_result
.
mData
)
?
0
:
1
;
return
ck
::
utils
::
check_err
(
e_m_n_device_result
,
e_m_n_host_result
)
?
0
:
1
;
}
}
return
0
;
return
0
;
...
...
example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
View file @
95a83c6e
...
@@ -124,7 +124,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
...
@@ -124,7 +124,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
if
(
config
.
do_verification
)
if
(
config
.
do_verification
)
{
{
Tensor
<
AccDataType
>
c_m_n
(
HostTensorDescriptor
{
M
,
N
});
Tensor
<
AccDataType
>
c_m_n
({
M
,
N
});
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
...
@@ -147,9 +147,9 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
...
@@ -147,9 +147,9 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
#ifdef BUILD_INT4_EXAMPLE
#ifdef BUILD_INT4_EXAMPLE
const
Tensor
<
EDataType
>
e_m_n_device_result_converted
(
e_m_n_device_result
);
const
Tensor
<
EDataType
>
e_m_n_device_result_converted
(
e_m_n_device_result
);
return
ck
::
utils
::
check_err
(
e_m_n_device_result_converted
.
mData
,
e_m_n_host_result
.
mData
);
return
ck
::
utils
::
check_err
(
e_m_n_device_result_converted
,
e_m_n_host_result
);
#else
#else
return
ck
::
utils
::
check_err
(
e_m_n_device_result
.
mData
,
e_m_n_host_result
.
mData
);
return
ck
::
utils
::
check_err
(
e_m_n_device_result
,
e_m_n_host_result
);
#endif
#endif
}
}
...
...
example/09_convnd_fwd/CMakeLists.txt
View file @
95a83c6e
...
@@ -4,3 +4,7 @@ add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
...
@@ -4,3 +4,7 @@ add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
add_example_executable
(
example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp
)
add_example_executable
(
example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp
)
# FIXME: re-enable this example as test when SWDEV-335738 is fixed
# FIXME: re-enable this example as test when SWDEV-335738 is fixed
add_example_executable_no_testing
(
example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp
)
add_example_executable_no_testing
(
example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp
)
add_example_executable
(
example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp
)
add_example_executable
(
example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp
)
add_example_executable
(
example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp
)
example/09_convnd_fwd/convnd_fwd_common.hpp
View file @
95a83c6e
...
@@ -10,6 +10,7 @@
...
@@ -10,6 +10,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
...
@@ -84,7 +85,7 @@ bool run_grouped_conv_fwd(bool do_verification,
...
@@ -84,7 +85,7 @@ bool run_grouped_conv_fwd(bool do_verification,
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
auto
&
x
,
auto
&
y
)
{
std
::
copy
(
x
.
begin
(),
x
.
end
()
,
y
.
begin
());
};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
ck
::
ranges
::
copy
(
x
,
y
.
begin
());
};
copy
(
in_g_n_c_wis_desc
.
GetLengths
(),
a_g_n_c_wis_lengths
);
copy
(
in_g_n_c_wis_desc
.
GetLengths
(),
a_g_n_c_wis_lengths
);
copy
(
in_g_n_c_wis_desc
.
GetStrides
(),
a_g_n_c_wis_strides
);
copy
(
in_g_n_c_wis_desc
.
GetStrides
(),
a_g_n_c_wis_strides
);
...
@@ -164,7 +165,7 @@ bool run_grouped_conv_fwd(bool do_verification,
...
@@ -164,7 +165,7 @@ bool run_grouped_conv_fwd(bool do_verification,
out_device_buf
.
FromDevice
(
out_device
.
mData
.
data
());
out_device_buf
.
FromDevice
(
out_device
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
return
ck
::
utils
::
check_err
(
out_device
.
mData
,
out_host
.
mData
,
"Error: incorrect results!"
,
1e-5
f
,
1e-4
f
);
out_device
,
out_host
,
"Error: incorrect results!"
,
1e-5
f
,
1e-4
f
);
}
}
return
true
;
return
true
;
...
...
example/
2
0_convnd_
b
wd
_weight
/convnd_
b
wd_
weight
_common.hpp
→
example/0
9
_convnd_
f
wd/convnd_
f
wd_
dl
_common.hpp
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
#include <initializer_list>
#include <type_traits>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_
b
wd
_weight
.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_
f
wd.hpp"
void
print_helper_msg
()
void
print_helper_msg
()
{
{
...
@@ -33,77 +34,97 @@ template <ck::index_t NDimSpatial,
...
@@ -33,77 +34,97 @@ template <ck::index_t NDimSpatial,
typename
InElementOp
,
typename
InElementOp
,
typename
WeiElementOp
,
typename
WeiElementOp
,
typename
OutElementOp
,
typename
OutElementOp
,
typename
DeviceConvBwdWeightInstance
>
typename
DeviceConvNDFwdInstance
>
int
run_conv_bwd_weight
(
bool
do_verification
,
bool
run_grouped_conv_fwd_dl
(
bool
do_verification
,
int
init_method
,
int
init_method
,
bool
time_kernel
,
bool
time_kernel
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
,
const
HostTensorDescriptor
&
in_g_n_c_wis_desc
,
const
HostTensorDescriptor
&
in_g_n_c_wis_desc
,
const
HostTensorDescriptor
&
wei_g_k_c_xs_desc
,
const
HostTensorDescriptor
&
wei_g_k_c_xs_desc
,
const
HostTensorDescriptor
&
out_g_n_k_wos_desc
,
const
HostTensorDescriptor
&
out_g_n_k_wos_desc
,
const
InElementOp
&
in_element_op
,
const
InElementOp
&
in_element_op
,
const
WeiElementOp
&
wei_element_op
,
const
WeiElementOp
&
wei_element_op
,
const
OutElementOp
&
out_element_op
,
const
OutElementOp
&
out_element_op
)
ck
::
index_t
split_k
)
{
{
Tensor
<
InDataType
>
in
(
in_g_n_c_wis_desc
);
Tensor
<
InDataType
>
in
(
in_g_n_c_wis_desc
);
Tensor
<
WeiDataType
>
wei
_host_result
(
wei_g_k_c_xs_desc
);
Tensor
<
WeiDataType
>
wei
(
wei_g_k_c_xs_desc
);
Tensor
<
Wei
DataType
>
wei_device_result
(
wei_g_k_c_x
s_desc
);
Tensor
<
Out
DataType
>
out_host
(
out_g_n_k_wo
s_desc
);
Tensor
<
OutDataType
>
out
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
out
_device
(
out_g_n_k_wos_desc
);
std
::
cout
<<
"in: "
<<
in
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"in: "
<<
in
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei: "
<<
wei
_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei: "
<<
wei
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out: "
<<
out
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out: "
<<
out
_host
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
case
1
:
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
5
,
5
});
out
.
GenerateTensorValue
(
GeneratorTensor_2
<
Out
DataType
>
{
-
5
,
5
});
wei
.
GenerateTensorValue
(
GeneratorTensor_2
<
Wei
DataType
>
{
-
5
,
5
});
break
;
break
;
default
:
case
2
:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
0.0
,
1.0
});
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
0.0
,
1.0
});
out
.
GenerateTensorValue
(
GeneratorTensor_3
<
OutDataType
>
{
-
0.5
,
0.5
});
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiDataType
>
{
-
0.5
,
0.5
});
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_1
<
InDataType
>
{
1
});
wei
.
GenerateTensorValue
(
GeneratorTensor_1
<
WeiDataType
>
{
1
});
}
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
wei
_device_result
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
wei
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out
_device
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
// init to 0
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths
{};
wei_device_buf
.
SetZero
();
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths
{};
// do GEMM
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides
{};
auto
conv
=
DeviceConvBwdWeightInstance
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
c_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
c_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
auto
&
x
,
auto
&
y
)
{
ck
::
ranges
::
copy
(
x
,
y
.
begin
());
};
copy
(
in_g_n_c_wis_desc
.
GetLengths
(),
a_g_n_c_wis_lengths
);
copy
(
in_g_n_c_wis_desc
.
GetStrides
(),
a_g_n_c_wis_strides
);
copy
(
wei_g_k_c_xs_desc
.
GetLengths
(),
b_g_k_c_xs_lengths
);
copy
(
wei_g_k_c_xs_desc
.
GetStrides
(),
b_g_k_c_xs_strides
);
copy
(
out_g_n_k_wos_desc
.
GetLengths
(),
c_g_n_k_wos_lengths
);
copy
(
out_g_n_k_wos_desc
.
GetStrides
(),
c_g_n_k_wos_strides
);
copy
(
conv_param
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_param
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_param
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_param
.
input_right_pads_
,
input_right_pads
);
// do Conv
auto
conv
=
DeviceConvNDFwdInstance
{};
auto
invoker
=
conv
.
MakeInvoker
();
auto
invoker
=
conv
.
MakeInvoker
();
auto
argument
=
conv
.
MakeArgument
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()
)
,
auto
argument
=
conv
.
MakeArgument
(
in_device_buf
.
GetDeviceBuffer
(),
static_cast
<
WeiDataType
*>
(
wei_device_buf
.
GetDeviceBuffer
()
)
,
wei_device_buf
.
GetDeviceBuffer
(),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()
)
,
out_device_buf
.
GetDeviceBuffer
(),
conv_param
.
N_
,
a_g_n_c_wis_lengths
,
conv_param
.
K_
,
a_g_n_c_wis_strides
,
conv_param
.
C_
,
b_g_k_c_xs_lengths
,
conv_param
.
input_spatial_lengths_
,
b_g_k_c_xs_strides
,
c
onv_param
.
filter_spatial
_lengths
_
,
c
_g_n_k_wos
_lengths
,
c
onv_param
.
output_spatial_lengths_
,
c
_g_n_k_wos_strides
,
conv_param
.
conv_filter_strides
_
,
conv_filter_strides
,
conv_param
.
conv_filter_dilations
_
,
conv_filter_dilations
,
conv_param
.
input_left_pads
_
,
input_left_pads
,
conv_param
.
input_right_pads
_
,
input_right_pads
,
in_element_op
,
in_element_op
,
wei_element_op
,
wei_element_op
,
out_element_op
,
out_element_op
);
split_k
);
if
(
!
conv
.
IsSupportedArgument
(
argument
))
if
(
!
conv
.
IsSupportedArgument
(
argument
))
{
{
std
::
cout
<<
"wrong! device_conv with the specified compilation parameters does "
return
true
;
"not support this Conv problem"
<<
std
::
endl
;
return
1
;
}
}
float
avg_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
});
float
avg_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
});
...
@@ -111,42 +132,40 @@ int run_conv_bwd_weight(bool do_verification,
...
@@ -111,42 +132,40 @@ int run_conv_bwd_weight(bool do_verification,
std
::
size_t
flop
=
conv_param
.
GetFlops
();
std
::
size_t
flop
=
conv_param
.
GetFlops
();
std
::
size_t
num_btype
=
conv_param
.
GetByte
<
InDataType
,
WeiDataType
,
OutDataType
>
();
std
::
size_t
num_btype
=
conv_param
.
GetByte
<
InDataType
,
WeiDataType
,
OutDataType
>
();
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
avg_time
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
avg_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s"
<<
conv
.
GetTypeString
()
<<
std
::
endl
;
<<
conv
.
GetTypeString
()
<<
std
::
endl
;
if
(
do_verification
)
if
(
do_verification
)
{
{
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvBwdWeight
<
NDimSpatial
,
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
NDimSpatial
,
InDataType
,
InDataType
,
WeiDataType
,
WeiDataType
,
OutDataType
,
OutDataType
,
InElementOp
,
InElementOp
,
WeiElementOp
,
WeiElementOp
,
OutElementOp
>
{};
OutElementOp
>
();
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in
,
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in
,
wei
_host_result
,
wei
,
out
,
out
_host
,
conv_param
.
conv_filter_strides_
,
conv_param
.
conv_filter_strides_
,
conv_param
.
conv_filter_dilations_
,
conv_param
.
conv_filter_dilations_
,
conv_param
.
input_left_pads_
,
conv_param
.
input_left_pads_
,
conv_param
.
input_right_pads_
,
conv_param
.
input_right_pads_
,
InE
lement
Op
{}
,
in_e
lement
_op
,
W
ei
E
lement
Op
{}
,
w
ei
_e
lement
_op
,
O
ut
E
lement
Op
{}
);
o
ut
_e
lement
_op
);
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
wei
_device_buf
.
FromDevice
(
wei
_device
_result
.
mData
.
data
());
out
_device_buf
.
FromDevice
(
out
_device
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
wei_device_result
.
mData
,
wei_host_result
.
mData
)
?
0
:
1
;
return
ck
::
utils
::
check_err
(
out_device
.
mData
,
out_host
.
mData
,
"Error: incorrect results!"
,
1e-5
f
,
1e-4
f
);
}
}
return
0
;
return
true
;
}
}
example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_dl_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using
InDataType
=
ck
::
half_t
;
using
WeiDataType
=
ck
::
half_t
;
using
AccDataType
=
float
;
using
OutDataType
=
ck
::
half_t
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
InElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
WeiElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
OutElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
ConvSpec
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
GemmPadingSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
ck
::
index_t
NDimSpatial
,
typename
InLayout
,
typename
WeiLayout
,
typename
OutLayout
>
// clang-format off
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
// ######| NDim| InData| WeiData| OutData| AccData| InLayout| WeiLayout| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ######| Spatial| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Forward| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<
NDimSpatial
,
InDataType
,
WeiDataType
,
OutDataType
,
AccDataType
,
InLayout
,
WeiLayout
,
OutLayout
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
ConvSpec
,
GemmPadingSpec
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
8
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
8
,
1
,
1
,
2
>
,
S
<
2
,
1
,
128
,
1
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
4
,
1
,
1
,
2
>
,
S
<
1
,
2
,
0
,
3
>
,
S
<
1
,
1
,
1
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
;
// clang-format on
#include "run_convnd_fwd_dl_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
run_convnd_fwd_dl_example
(
argc
,
argv
)
?
0
:
1
;
}
example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_dl_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using
InDataType
=
float
;
using
WeiDataType
=
float
;
using
AccDataType
=
float
;
using
OutDataType
=
float
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
InElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
WeiElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
OutElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
ConvSpec
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
GemmPadingSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
// Device operation instantiated by this example (fp32 data, DL kernel).
// The spatial rank and the three tensor layouts stay template parameters so the
// shared driver can instantiate it per problem; every other argument is fixed here.
// Each argument below is annotated with the name of the template parameter it fills.
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
// clang-format off
using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
    <
        NDimSpatial,          // NDimSpatial
        InDataType,           // InDataType
        WeiDataType,          // WeiDataType
        OutDataType,          // OutDataType
        AccDataType,          // AccDataType
        InLayout,             // InLayout
        WeiLayout,            // WeiLayout
        OutLayout,            // OutLayout
        InElementOp,          // InElementwiseOperation
        WeiElementOp,         // WeiElementwiseOperation
        OutElementOp,         // OutElementwiseOperation
        ConvSpec,             // ConvolutionForwardSpecialization
        GemmPadingSpec,       // GEMM Specialization
        256,                  // BlockSize
        128,                  // MPerBlock
        128,                  // NPerBlock
        16,                   // K0PerBlock
        1,                    // K1
        4,                    // M1PerThreadM111
        4,                    // N1PerThreadN111
        1,                    // KPerThread
        S<8, 2>,              // M11N11ThreadClusterM110Xs
        S<8, 2>,              // M11N11ThreadClusterN110Xs
        S<8, 1, 1, 1>,        // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
        S<2, 1, 128, 1>,      // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
        S<1, 2, 0, 3>,        // ABlockTransferThreadClusterArrangeOrder
        S<1, 2, 0, 3>,        // ABlockTransferSrcAccessOrder
        S<4, 1, 1, 1>,        // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
        S<1, 2, 0, 3>,        // ABlockTransferSrcVectorTensorContiguousDimOrder
        S<1, 1, 1, 1>,        // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
        S<8, 1, 1, 1>,        // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
        S<2, 1, 128, 1>,      // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
        S<1, 2, 0, 3>,        // BBlockTransferThreadClusterArrangeOrder
        S<1, 2, 0, 3>,        // BBlockTransferSrcAccessOrder
        S<4, 1, 1, 1>,        // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
        S<1, 2, 0, 3>,        // BBlockTransferSrcVectorTensorContiguousDimOrder
        S<1, 1, 1, 1>,        // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
        S<0, 1, 2, 3, 4, 5>,  // CThreadTransferSrcDstAccessOrder
        5,                    // CThreadTransferSrcDstVectorDim
        4                     // CThreadTransferDstScalarPerVector
    >;
// clang-format on
#include "run_convnd_fwd_dl_example.inc"
// Program entry point: forwards the command line to the shared example driver
// and converts its boolean outcome into a process exit status
// (0 when the driver returns true, 1 otherwise).
int main(int argc, char* argv[])
{
    const bool succeeded = run_convnd_fwd_dl_example(argc, argv);

    if(!succeeded)
    {
        return 1;
    }

    return 0;
}
example/09_convnd_fwd/convnd_fwd_dl_int8.cpp
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_dl_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
// Element types used by this example: input, weight and output are int8_t,
// while the accumulator type is the wider int32_t.
using InDataType  = int8_t;
using WeiDataType = int8_t;
using AccDataType = int32_t;
using OutDataType = int8_t;

// Shorthand for a compile-time integer sequence (ck::Sequence).
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

// Element-wise operations attached to the input, weight and output tensors.
// All three use PassThrough (presumably an identity operation — confirm in
// ck/tensor_operation/gpu/element).
using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

// Convolution-forward specialization passed to the device instance (Default).
static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

// GEMM specialization passed to the device instance (MNKPadding).
static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
// Device operation instantiated by this example (int8 data, int32 accumulation, DL kernel).
// The spatial rank and the three tensor layouts stay template parameters so the
// shared driver can instantiate it per problem; every other argument is fixed here.
// Each argument below is annotated with the name of the template parameter it fills.
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
// clang-format off
using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
    <
        NDimSpatial,          // NDimSpatial
        InDataType,           // InDataType
        WeiDataType,          // WeiDataType
        OutDataType,          // OutDataType
        AccDataType,          // AccDataType
        InLayout,             // InLayout
        WeiLayout,            // WeiLayout
        OutLayout,            // OutLayout
        InElementOp,          // InElementwiseOperation
        WeiElementOp,         // WeiElementwiseOperation
        OutElementOp,         // OutElementwiseOperation
        ConvSpec,             // ConvolutionForwardSpecialization
        GemmPadingSpec,       // GEMM Specialization
        256,                  // BlockSize
        128,                  // MPerBlock
        128,                  // NPerBlock
        16,                   // K0PerBlock
        4,                    // K1
        4,                    // M1PerThreadM111
        4,                    // N1PerThreadN111
        1,                    // KPerThread
        S<8, 2>,              // M11N11ThreadClusterM110Xs
        S<8, 2>,              // M11N11ThreadClusterN110Xs
        S<8, 1, 1, 4>,        // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
        S<2, 1, 128, 1>,      // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
        S<1, 2, 0, 3>,        // ABlockTransferThreadClusterArrangeOrder
        S<1, 2, 0, 3>,        // ABlockTransferSrcAccessOrder
        S<4, 1, 1, 4>,        // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
        S<1, 2, 0, 3>,        // ABlockTransferSrcVectorTensorContiguousDimOrder
        S<1, 1, 1, 4>,        // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
        S<8, 1, 1, 4>,        // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
        S<2, 1, 128, 1>,      // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
        S<1, 2, 0, 3>,        // BBlockTransferThreadClusterArrangeOrder
        S<1, 2, 0, 3>,        // BBlockTransferSrcAccessOrder
        S<4, 1, 1, 4>,        // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
        S<1, 2, 0, 3>,        // BBlockTransferSrcVectorTensorContiguousDimOrder
        S<1, 1, 1, 4>,        // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
        S<0, 1, 2, 3, 4, 5>,  // CThreadTransferSrcDstAccessOrder
        5,                    // CThreadTransferSrcDstVectorDim
        4                     // CThreadTransferDstScalarPerVector
    >;
// clang-format on
#include "run_convnd_fwd_dl_example.inc"
// Program entry point: forwards the command line to the shared example driver
// and converts its boolean outcome into a process exit status
// (0 when the driver returns true, 1 otherwise).
int main(int argc, char* argv[])
{
    const bool succeeded = run_convnd_fwd_dl_example(argc, argv);

    if(!succeeded)
    {
        return 1;
    }

    return 0;
}
example/09_convnd_fwd/run_convnd_fwd_dl_example.inc
0 → 100644
View file @
95a83c6e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
/// Parses the command line, builds the convolution problem description and runs the
/// DL grouped forward-convolution example for 1, 2 or 3 spatial dimensions.
///
/// Accepted command lines (argc counts the program name):
///   - argc == 1: use the built-in default problem;
///   - argc == 4: do_verification, init_method, time_kernel;
///   - argc >= 5: the three flags, then num_dim_spatial, then the convolution
///     parameters consumed by ck::utils::conv::parse_conv_param starting at argv[5].
///
/// @param argc  Argument count as received by main.
/// @param argv  Argument vector as received by main.
/// @return The value returned by run_grouped_conv_fwd_dl for the selected rank, or
///         false when too few arguments were given or the number of spatial
///         dimensions is not 1, 2 or 3.
bool run_convnd_fwd_dl_example(int argc, char* argv[])
{
    print_helper_msg();

    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;

    // Default problem used when the command line does not supply one.
    // The leading 2 is presumably the number of spatial dimensions — confirm against ConvParam.
    ck::utils::conv::ConvParam conv_param{
        2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};

    if(argc == 1)
    {
        // use default
    }
    else if(argc >= 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);

        if(argc > 4)
        {
            const ck::index_t num_dim_spatial = std::stoi(argv[4]);

            // NOTE(review): the number of trailing arguments expected by parse_conv_param
            // is not validated here; a short command line can still read past argv.
            conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
        }
    }
    else
    {
        // argc is 2 or 3: reading argv[3] would dereference a null or out-of-range entry.
        std::cerr << "Error: expected 0, 3, or at least 4 command-line arguments, got "
                  << (argc - 1) << std::endl;
        return false;
    }

    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};

    // Instantiates and runs the example for one compile-time rank / layout combination.
    // The rank and layouts are passed as tag objects so their types select the instance.
    const auto run = [&](auto ndim_spatial, auto in_layout, auto wei_layout, auto out_layout) {
        constexpr ck::index_t ndim_spatial_value = ndim_spatial.value;
        std::cout << "ndim_spatial_value: " << ndim_spatial_value << std::endl;

        using InLayout  = decltype(in_layout);
        using WeiLayout = decltype(wei_layout);
        using OutLayout = decltype(out_layout);

        const auto in_g_n_c_wis_desc =
            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
                conv_param);

        const auto wei_g_k_c_xs_desc =
            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
                conv_param);

        const auto out_g_n_k_wos_desc =
            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
                conv_param);

        return run_grouped_conv_fwd_dl<
            ndim_spatial_value,
            InDataType,
            WeiDataType,
            OutDataType,
            InElementOp,
            WeiElementOp,
            OutElementOp,
            DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    };

    namespace ctc = ck::tensor_layout::convolution;

    if(conv_param.num_dim_spatial_ == 1)
    {
        return run(ck::Number<1>{}, ctc::GNWC{}, ctc::GKXC{}, ctc::GNWK{});
    }
    else if(conv_param.num_dim_spatial_ == 2)
    {
        return run(ck::Number<2>{}, ctc::GNHWC{}, ctc::GKYXC{}, ctc::GNHWK{});
    }
    else if(conv_param.num_dim_spatial_ == 3)
    {
        return run(ck::Number<3>{}, ctc::GNDHWC{}, ctc::GKZYXC{}, ctc::GNDHWK{});
    }

    // Nothing was executed for this rank, so report failure instead of a silent success.
    std::cerr << "Error: unsupported number of spatial dimensions: "
              << conv_param.num_dim_spatial_ << " (expected 1, 2 or 3)" << std::endl;
    return false;
}
example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
View file @
95a83c6e
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
...
@@ -140,9 +141,7 @@ make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
...
@@ -140,9 +141,7 @@ make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
{
{
std
::
vector
<
ck
::
index_t
>
dimensions
{
problem_size
.
G_
,
problem_size
.
N_
};
std
::
vector
<
ck
::
index_t
>
dimensions
{
problem_size
.
G_
,
problem_size
.
N_
};
std
::
copy
(
begin
(
problem_size
.
output_spatial_lengths_
),
ck
::
ranges
::
copy
(
problem_size
.
output_spatial_lengths_
,
std
::
back_inserter
(
dimensions
));
end
(
problem_size
.
output_spatial_lengths_
),
std
::
back_inserter
(
dimensions
));
return
HostTensorDescriptor
(
dimensions
);
return
HostTensorDescriptor
(
dimensions
);
}
}
...
@@ -158,10 +157,3 @@ void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
...
@@ -158,10 +157,3 @@ void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
assert
(
size
(
descriptor
.
GetStrides
())
==
size
(
strides
));
assert
(
size
(
descriptor
.
GetStrides
())
==
size
(
strides
));
std
::
copy_n
(
begin
(
descriptor
.
GetStrides
()),
size
(
descriptor
.
GetStrides
()),
begin
(
strides
));
std
::
copy_n
(
begin
(
descriptor
.
GetStrides
()),
size
(
descriptor
.
GetStrides
()),
begin
(
strides
));
}
}
template
<
typename
Range
,
typename
OutputIterator
>
auto
copy
(
const
Range
&
range
,
OutputIterator
iter
)
->
decltype
(
std
::
copy
(
std
::
begin
(
range
),
std
::
end
(
range
),
iter
))
{
return
std
::
copy
(
std
::
begin
(
range
),
std
::
end
(
range
),
iter
);
}
example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
View file @
95a83c6e
...
@@ -77,15 +77,12 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
...
@@ -77,15 +77,12 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
{
{
case
0
:
break
;
case
0
:
break
;
case
1
:
case
1
:
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
8
,
7
}(
conv_input
.
begin
(),
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
8
,
7
}(
conv_input
);
conv_input
.
end
());
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
-
8
,
7
}(
conv_weight
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
-
8
,
7
}(
conv_weight
.
begin
(),
conv_weight
.
end
());
break
;
break
;
default
:
default
:
ck
::
utils
::
FillUniformDistribution
<
ADataType
>
{
-
5
,
5
}(
conv_input
.
begin
(),
conv_input
.
end
());
ck
::
utils
::
FillUniformDistribution
<
ADataType
>
{
-
5
,
5
}(
conv_input
);
ck
::
utils
::
FillUniformDistribution
<
BDataType
>
{
-
5
,
5
}(
conv_weight
.
begin
(),
ck
::
utils
::
FillUniformDistribution
<
BDataType
>
{
-
5
,
5
}(
conv_weight
);
conv_weight
.
end
());
}
}
DeviceMem
conv_input_device_buf
(
sizeof
(
ADataType
)
*
conv_input
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
conv_input_device_buf
(
sizeof
(
ADataType
)
*
conv_input
.
mDesc
.
GetElementSpaceSize
());
...
@@ -123,10 +120,10 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
...
@@ -123,10 +120,10 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
conv_output_g_n_k_wos_desc
,
conv_output_g_n_k_wos_lengths
,
conv_output_g_n_k_wos_strides
);
conv_output_g_n_k_wos_desc
,
conv_output_g_n_k_wos_lengths
,
conv_output_g_n_k_wos_strides
);
unpack_host_tensor_descriptor
(
r0_desc
,
r0_lengths
,
r0_strides
);
unpack_host_tensor_descriptor
(
r0_desc
,
r0_lengths
,
r0_strides
);
copy
(
problem_size
.
conv_filter_strides_
,
begin
(
conv_filter_strides
));
ck
::
ranges
::
copy
(
problem_size
.
conv_filter_strides_
,
begin
(
conv_filter_strides
));
copy
(
problem_size
.
conv_filter_dilations_
,
begin
(
conv_filter_dilations
));
ck
::
ranges
::
copy
(
problem_size
.
conv_filter_dilations_
,
begin
(
conv_filter_dilations
));
copy
(
problem_size
.
input_left_pads_
,
begin
(
input_left_pads
));
ck
::
ranges
::
copy
(
problem_size
.
input_left_pads_
,
begin
(
input_left_pads
));
copy
(
problem_size
.
input_right_pads_
,
begin
(
input_right_pads
));
ck
::
ranges
::
copy
(
problem_size
.
input_right_pads_
,
begin
(
input_right_pads
));
// run Conv + Reduction on device
// run Conv + Reduction on device
auto
conv
=
DeviceInstance
<
NDimSpatial
>
{};
auto
conv
=
DeviceInstance
<
NDimSpatial
>
{};
...
@@ -276,16 +273,13 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
...
@@ -276,16 +273,13 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
conv_output_device_buf
.
FromDevice
(
conv_output_device
.
mData
.
data
());
conv_output_device_buf
.
FromDevice
(
conv_output_device
.
mData
.
data
());
r0_device_buf
.
FromDevice
(
r0_device
.
mData
.
data
());
r0_device_buf
.
FromDevice
(
r0_device
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
conv_output_device
.
mData
,
return
ck
::
utils
::
check_err
(
conv_output_device
,
conv_output_host
.
mData
,
conv_output_host
,
"Error: incorrect results! (Matrix E)"
,
"Error: incorrect results! (Matrix E)"
,
1
e
-
5
f
,
1
e
-
5
f
,
1
e
-
4
f
)
&&
1
e
-
4
f
)
&&
ck
::
utils
::
check_err
(
r0_device
.
mData
,
ck
::
utils
::
check_err
(
r0_host
.
mData
,
r0_device
,
r0_host
,
"Error: incorrect results! (Matrix R0)"
,
1
e
-
5
f
,
1
e
-
4
f
);
"Error: incorrect results! (Matrix R0)"
,
1
e
-
5
f
,
1
e
-
4
f
);
}
}
return
true
;
return
true
;
...
...
example/12_reduce/reduce_blockwise.cpp
View file @
95a83c6e
...
@@ -140,6 +140,10 @@ bool reduce_blockwise_test(bool do_verification,
...
@@ -140,6 +140,10 @@ bool reduce_blockwise_test(bool do_verification,
if
(
ShapeType
::
Rank_
!=
inLengths
.
size
()
||
ShapeType
::
NumReduceDim_
!=
reduceDims
.
size
())
if
(
ShapeType
::
Rank_
!=
inLengths
.
size
()
||
ShapeType
::
NumReduceDim_
!=
reduceDims
.
size
())
return
;
return
;
std
::
array
<
int
,
ShapeType
::
NumReduceDim_
>
arrReduceDims
;
ck
::
ranges
::
copy
(
reduceDims
,
arrReduceDims
.
begin
());
result
=
reduce_blockwise_impl
<
InOutDataType
,
result
=
reduce_blockwise_impl
<
InOutDataType
,
AccDataType
,
AccDataType
,
ReduceOpId
,
ReduceOpId
,
...
@@ -147,7 +151,7 @@ bool reduce_blockwise_test(bool do_verification,
...
@@ -147,7 +151,7 @@ bool reduce_blockwise_test(bool do_verification,
ShapeType
::
NumReduceDim_
,
ShapeType
::
NumReduceDim_
,
PropagateNan
,
PropagateNan
,
OutputIndex
>
(
OutputIndex
>
(
do_verification
,
init_method
,
time_kernel
,
inLengths
,
r
educeDims
,
alpha
,
beta
);
do_verification
,
init_method
,
time_kernel
,
inLengths
,
arrR
educeDims
,
alpha
,
beta
);
matched
=
true
;
matched
=
true
;
});
});
...
...
example/12_reduce/reduce_blockwise_impl.hpp
View file @
95a83c6e
...
@@ -10,6 +10,7 @@
...
@@ -10,6 +10,7 @@
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
...
@@ -30,7 +31,7 @@ int reduce_blockwise_impl(bool do_verification,
...
@@ -30,7 +31,7 @@ int reduce_blockwise_impl(bool do_verification,
int
init_method
,
int
init_method
,
bool
time_kernel
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
int
>&
reduceDims
,
const
std
::
array
<
int
,
NumReduceDim
>&
reduceDims
,
float
alpha
,
float
alpha
,
float
beta
)
float
beta
)
...
@@ -38,6 +39,8 @@ int reduce_blockwise_impl(bool do_verification,
...
@@ -38,6 +39,8 @@ int reduce_blockwise_impl(bool do_verification,
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tensor_operation
::
device
;
using
namespace
ck
::
tensor_operation
::
device
;
constexpr
index_t
NumOutDim
=
(
Rank
-
NumReduceDim
==
0
)
?
1
:
Rank
-
NumReduceDim
;
constexpr
bool
op_support_indices
=
constexpr
bool
op_support_indices
=
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
(
ReduceOpId
==
ReduceTensorOp
::
MIN
||
ReduceOpId
==
ReduceTensorOp
::
MAX
||
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
ReduceOpId
==
ReduceTensorOp
::
AMAX
);
...
@@ -143,7 +146,7 @@ int reduce_blockwise_impl(bool do_verification,
...
@@ -143,7 +146,7 @@ int reduce_blockwise_impl(bool do_verification,
std
::
vector
<
size_t
>
outLengths
;
std
::
vector
<
size_t
>
outLengths
;
std
::
vector
<
int
>
invariantDims
=
get_invariant_dims
<
Rank
,
NumReduceDim
>
(
reduceDims
);
auto
invariantDims
=
get_invariant_dims
<
Rank
,
NumReduceDim
>
(
reduceDims
);
if
(
invariantDims
.
empty
())
if
(
invariantDims
.
empty
())
outLengths
.
push_back
(
1
);
outLengths
.
push_back
(
1
);
...
@@ -256,22 +259,22 @@ int reduce_blockwise_impl(bool do_verification,
...
@@ -256,22 +259,22 @@ int reduce_blockwise_impl(bool do_verification,
acc_elementwise_op
);
acc_elementwise_op
);
};
};
std
::
vector
<
ck
::
index_t
>
i_i
nLengths
;
std
::
array
<
index_t
,
Rank
>
arrI
nLengths
;
std
::
vector
<
ck
::
index_t
>
i_i
nStrides
;
std
::
array
<
index_t
,
Rank
>
arrI
nStrides
;
std
::
vector
<
ck
::
index_t
>
i_o
utLengths
;
std
::
array
<
index_t
,
NumOutDim
>
arrO
utLengths
;
std
::
vector
<
ck
::
index_t
>
i_o
utStrides
;
std
::
array
<
index_t
,
NumOutDim
>
arrO
utStrides
;
i_inLengths
.
assign
(
inLengths
.
begin
(),
i
nLengths
.
end
());
ck
::
ranges
::
copy
(
inLengths
,
arrI
nLengths
.
begin
());
i_inStrides
.
assign
(
inStrides
.
begin
(),
i
nStrides
.
end
());
ck
::
ranges
::
copy
(
inStrides
,
arrI
nStrides
.
begin
());
i_outLengths
.
assign
(
outLengths
.
begin
(),
o
utLengths
.
end
());
ck
::
ranges
::
copy
(
outLengths
,
arrO
utLengths
.
begin
());
i_outStrides
.
assign
(
outStrides
.
begin
(),
o
utStrides
.
end
());
ck
::
ranges
::
copy
(
outStrides
,
arrO
utStrides
.
begin
());
auto
reduce
=
DeviceReduceInstance
{};
auto
reduce
=
DeviceReduceInstance
{};
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
i_i
nLengths
,
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
arrI
nLengths
,
i_i
nStrides
,
arrI
nStrides
,
i_o
utLengths
,
arrO
utLengths
,
i_o
utStrides
,
arrO
utStrides
,
reduceDims
,
reduceDims
,
alpha
,
alpha
,
beta
,
beta
,
...
@@ -322,12 +325,12 @@ int reduce_blockwise_impl(bool do_verification,
...
@@ -322,12 +325,12 @@ int reduce_blockwise_impl(bool do_verification,
#endif
#endif
out_dev
.
FromDevice
(
out
.
mData
.
data
());
out_dev
.
FromDevice
(
out
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
,
out_ref
);
if
(
OutputIndex
)
if
(
OutputIndex
)
{
{
out_index_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
out_index_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_indices
,
out_indices_ref
);
};
};
};
};
...
...
example/12_reduce/reduce_blockwise_two_call.cpp
View file @
95a83c6e
...
@@ -90,15 +90,15 @@ static bool time_kernel;
...
@@ -90,15 +90,15 @@ static bool time_kernel;
int
main
(
int
argc
,
char
*
argv
[])
int
main
(
int
argc
,
char
*
argv
[])
{
{
// used by the device reduction
// used by the device reduction
const
std
::
vector
<
int
>
reduceDims_1
=
{
4
};
const
std
::
array
<
int
,
1
>
reduceDims_1
=
{
4
};
const
std
::
vector
<
int
>
invariantDims_1
=
{
0
,
1
,
2
,
3
};
//
const std::
array
<int
, 4
> invariantDims_1 = {0, 1, 2, 3};
const
std
::
vector
<
int
>
reduceDims_2
=
{
3
};
const
std
::
array
<
int
,
1
>
reduceDims_2
=
{
3
};
const
std
::
vector
<
int
>
invariantDims_2
=
{
0
,
1
,
2
};
//
const std::
array
<int
, 3
> invariantDims_2 = {0, 1, 2};
// used by the host reduction
// used by the host reduction
const
std
::
vector
<
int
>
reduceDims
=
{
3
,
4
};
const
std
::
array
<
int
,
2
>
reduceDims
=
{
3
,
4
};
const
std
::
vector
<
int
>
invariantDims
=
{
0
,
1
,
2
};
const
std
::
array
<
int
,
3
>
invariantDims
=
{
0
,
1
,
2
};
const
std
::
vector
<
size_t
>
inLengths_1
=
{
64
,
320
,
80
,
4
,
128
};
const
std
::
vector
<
size_t
>
inLengths_1
=
{
64
,
320
,
80
,
4
,
128
};
...
@@ -214,26 +214,26 @@ int main(int argc, char* argv[])
...
@@ -214,26 +214,26 @@ int main(int argc, char* argv[])
acc_elementwise_op
);
acc_elementwise_op
);
};
};
std
::
vector
<
ck
::
index_t
>
i_i
nLengths_1
;
std
::
array
<
index_t
,
5
>
arrI
nLengths_1
;
std
::
vector
<
ck
::
index_t
>
i_i
nStrides_1
;
std
::
array
<
index_t
,
5
>
arrI
nStrides_1
;
std
::
vector
<
ck
::
index_t
>
i_i
nLengths_2
;
std
::
array
<
index_t
,
4
>
arrI
nLengths_2
;
std
::
vector
<
ck
::
index_t
>
i_i
nStrides_2
;
std
::
array
<
index_t
,
4
>
arrI
nStrides_2
;
std
::
vector
<
ck
::
index_t
>
i_o
utLengths
;
std
::
array
<
index_t
,
3
>
arrO
utLengths
;
std
::
vector
<
ck
::
index_t
>
i_o
utStrides
;
std
::
array
<
index_t
,
3
>
arrO
utStrides
;
i_inLengths_1
.
assign
(
inLengths_1
.
begin
(),
i
nLengths_1
.
end
());
ck
::
ranges
::
copy
(
inLengths_1
,
arrI
nLengths_1
.
begin
());
i_inStrides_1
.
assign
(
inStrides_1
.
begin
(),
i
nStrides_1
.
end
());
ck
::
ranges
::
copy
(
inStrides_1
,
arrI
nStrides_1
.
begin
());
i_inLengths_2
.
assign
(
inLengths_2
.
begin
(),
i
nLengths_2
.
end
());
ck
::
ranges
::
copy
(
inLengths_2
,
arrI
nLengths_2
.
begin
());
i_inStrides_2
.
assign
(
inStrides_2
.
begin
(),
i
nStrides_2
.
end
());
ck
::
ranges
::
copy
(
inStrides_2
,
arrI
nStrides_2
.
begin
());
i_outLengths
.
assign
(
outLengths
.
begin
(),
o
utLengths
.
end
());
ck
::
ranges
::
copy
(
outLengths
,
arrO
utLengths
.
begin
());
i_outStrides
.
assign
(
outStrides
.
begin
(),
o
utStrides
.
end
());
ck
::
ranges
::
copy
(
outStrides
,
arrO
utStrides
.
begin
());
auto
reduce_1
=
DeviceReduceInstance_1
{};
auto
reduce_1
=
DeviceReduceInstance_1
{};
auto
argument_ptr_1
=
reduce_1
.
MakeArgumentPointer
(
i_i
nLengths_1
,
auto
argument_ptr_1
=
reduce_1
.
MakeArgumentPointer
(
arrI
nLengths_1
,
i_i
nStrides_1
,
arrI
nStrides_1
,
i_i
nLengths_2
,
arrI
nLengths_2
,
i_i
nStrides_2
,
arrI
nStrides_2
,
reduceDims_1
,
reduceDims_1
,
1.0
f
,
1.0
f
,
0.0
f
,
0.0
f
,
...
@@ -255,10 +255,10 @@ int main(int argc, char* argv[])
...
@@ -255,10 +255,10 @@ int main(int argc, char* argv[])
auto
reduce_2
=
DeviceReduceInstance_2
{};
auto
reduce_2
=
DeviceReduceInstance_2
{};
auto
argument_ptr_2
=
reduce_2
.
MakeArgumentPointer
(
i_i
nLengths_2
,
auto
argument_ptr_2
=
reduce_2
.
MakeArgumentPointer
(
arrI
nLengths_2
,
i_i
nStrides_2
,
arrI
nStrides_2
,
i_o
utLengths
,
arrO
utLengths
,
i_o
utStrides
,
arrO
utStrides
,
reduceDims_2
,
reduceDims_2
,
alpha
,
alpha
,
beta
,
beta
,
...
@@ -294,7 +294,7 @@ int main(int argc, char* argv[])
...
@@ -294,7 +294,7 @@ int main(int argc, char* argv[])
if
(
do_verify
)
if
(
do_verify
)
{
{
out_dev
.
FromDevice
(
out
.
mData
.
data
());
out_dev
.
FromDevice
(
out
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
,
out_ref
);
};
};
return
(
pass
?
0
:
1
);
return
(
pass
?
0
:
1
);
...
...
example/12_reduce/reduce_example_common.hpp
View file @
95a83c6e
...
@@ -5,11 +5,10 @@
...
@@ -5,11 +5,10 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
template
<
ck
::
index_t
Rank
,
ck
::
index_t
NumReduceDim
>
template
<
int
Rank
,
int
NumReduceDim
>
std
::
vector
<
int
>
get_invariant_dims
(
const
std
::
vector
<
int
>&
reduceDims
)
static
inline
std
::
array
<
int
,
Rank
-
NumReduceDim
>
get_invariant_dims
(
const
std
::
array
<
int
,
NumReduceDim
>&
reduceDims
)
{
{
assert
(
NumReduceDim
==
reduceDims
.
size
());
int
reduceFlag
=
0
;
int
reduceFlag
=
0
;
// flag the bits for the reduceDims
// flag the bits for the reduceDims
...
@@ -18,13 +17,15 @@ std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
...
@@ -18,13 +17,15 @@ std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
reduceFlag
|=
1
<<
reduceDims
[
i
];
reduceFlag
|=
1
<<
reduceDims
[
i
];
};
};
std
::
vector
<
int
>
invariantDims
;
std
::
array
<
int
,
Rank
-
NumReduceDim
>
invariantDims
;
// collect invariant dimensions
// collect invariant dimensions
int
dim
=
0
;
for
(
int
i
=
0
;
i
<
Rank
;
i
++
)
for
(
int
i
=
0
;
i
<
Rank
;
i
++
)
if
((
reduceFlag
&
(
1
<<
i
))
==
0
)
if
((
reduceFlag
&
(
1
<<
i
))
==
0
)
{
{
invariantDims
.
push_back
(
i
);
invariantDims
[
dim
]
=
i
;
dim
++
;
};
};
return
invariantDims
;
return
invariantDims
;
...
...
example/12_reduce/reduce_multiblock_atomic_add.cpp
View file @
95a83c6e
...
@@ -138,13 +138,17 @@ bool reduce_multiblock_atomic_add_test(bool do_verification,
...
@@ -138,13 +138,17 @@ bool reduce_multiblock_atomic_add_test(bool do_verification,
if
(
ShapeType
::
Rank_
!=
inLengths
.
size
()
||
ShapeType
::
NumReduceDim_
!=
reduceDims
.
size
())
if
(
ShapeType
::
Rank_
!=
inLengths
.
size
()
||
ShapeType
::
NumReduceDim_
!=
reduceDims
.
size
())
return
;
return
;
std
::
array
<
int
,
ShapeType
::
NumReduceDim_
>
a_reduceDims
;
ck
::
ranges
::
copy
(
reduceDims
,
a_reduceDims
.
begin
());
result
=
reduce_multiblock_atomic_add_impl
<
InOutDataType
,
result
=
reduce_multiblock_atomic_add_impl
<
InOutDataType
,
AccDataType
,
AccDataType
,
ReduceOpId
,
ReduceOpId
,
ShapeType
::
Rank_
,
ShapeType
::
Rank_
,
ShapeType
::
NumReduceDim_
,
ShapeType
::
NumReduceDim_
,
PropagateNan
>
(
PropagateNan
>
(
do_verification
,
init_method
,
time_kernel
,
inLengths
,
reduceDims
,
alpha
,
beta
);
do_verification
,
init_method
,
time_kernel
,
inLengths
,
a_
reduceDims
,
alpha
,
beta
);
matched
=
true
;
matched
=
true
;
});
});
...
...
example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
View file @
95a83c6e
...
@@ -10,6 +10,7 @@
...
@@ -10,6 +10,7 @@
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
...
@@ -29,7 +30,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
...
@@ -29,7 +30,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
int
init_method
,
int
init_method
,
bool
time_kernel
,
bool
time_kernel
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
size_t
>&
inLengths
,
const
std
::
vector
<
int
>&
reduceDims
,
const
std
::
array
<
int
,
NumReduceDim
>&
reduceDims
,
float
alpha
,
float
alpha
,
float
beta
)
float
beta
)
...
@@ -37,6 +38,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
...
@@ -37,6 +38,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tensor_operation
::
device
;
using
namespace
ck
::
tensor_operation
::
device
;
constexpr
index_t
NumOutDim
=
(
Rank
-
NumReduceDim
==
0
)
?
1
:
Rank
-
NumReduceDim
;
constexpr
bool
op_support_atomic_add
=
constexpr
bool
op_support_atomic_add
=
(
ReduceOpId
==
ReduceTensorOp
::
ADD
||
ReduceOpId
==
ReduceTensorOp
::
AVG
);
(
ReduceOpId
==
ReduceTensorOp
::
ADD
||
ReduceOpId
==
ReduceTensorOp
::
AVG
);
...
@@ -84,7 +87,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
...
@@ -84,7 +87,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
std
::
vector
<
size_t
>
outLengths
;
std
::
vector
<
size_t
>
outLengths
;
std
::
vector
<
int
>
invariantDims
=
get_invariant_dims
<
Rank
,
NumReduceDim
>
(
reduceDims
);
auto
invariantDims
=
get_invariant_dims
<
Rank
,
NumReduceDim
>
(
reduceDims
);
if
(
invariantDims
.
empty
())
if
(
invariantDims
.
empty
())
outLengths
.
push_back
(
1
);
outLengths
.
push_back
(
1
);
...
@@ -169,22 +172,22 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
...
@@ -169,22 +172,22 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
acc_elementwise_op
);
acc_elementwise_op
);
};
};
std
::
vector
<
ck
::
index_t
>
i_i
nLengths
;
std
::
array
<
index_t
,
Rank
>
arrI
nLengths
;
std
::
vector
<
ck
::
index_t
>
i_i
nStrides
;
std
::
array
<
index_t
,
Rank
>
arrI
nStrides
;
std
::
vector
<
ck
::
index_t
>
i_o
utLengths
;
std
::
array
<
index_t
,
NumOutDim
>
arrO
utLengths
;
std
::
vector
<
ck
::
index_t
>
i_o
utStrides
;
std
::
array
<
index_t
,
NumOutDim
>
arrO
utStrides
;
i_inLengths
.
assign
(
inLengths
.
begin
(),
i
nLengths
.
end
());
ck
::
ranges
::
copy
(
inLengths
,
arrI
nLengths
.
begin
());
i_inStrides
.
assign
(
inStrides
.
begin
(),
i
nStrides
.
end
());
ck
::
ranges
::
copy
(
inStrides
,
arrI
nStrides
.
begin
());
i_outLengths
.
assign
(
outLengths
.
begin
(),
o
utLengths
.
end
());
ck
::
ranges
::
copy
(
outLengths
,
arrO
utLengths
.
begin
());
i_outStrides
.
assign
(
outStrides
.
begin
(),
o
utStrides
.
end
());
ck
::
ranges
::
copy
(
outStrides
,
arrO
utStrides
.
begin
());
auto
reduce
=
DeviceReduceInstance
{};
auto
reduce
=
DeviceReduceInstance
{};
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
i_i
nLengths
,
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
arrI
nLengths
,
i_i
nStrides
,
arrI
nStrides
,
i_o
utLengths
,
arrO
utLengths
,
i_o
utStrides
,
arrO
utStrides
,
reduceDims
,
reduceDims
,
alpha
,
alpha
,
beta
,
beta
,
...
@@ -223,7 +226,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
...
@@ -223,7 +226,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
if
(
do_verification
)
if
(
do_verification
)
{
{
out_dev
.
FromDevice
(
out
.
mData
.
data
());
out_dev
.
FromDevice
(
out
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
,
out_ref
);
};
};
return
(
pass
?
0
:
1
);
return
(
pass
?
0
:
1
);
...
...
Prev
1
2
3
4
5
6
…
31
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment