gaoqiong / composable_kernel

Commit 66d93ae5, authored Jul 12, 2022 by rocking

    Rename Reduce -> R

Parent: 63914743
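The change is a mechanical rename of the Reduce-prefixed aliases and template parameters to a shorter R prefix across the GEMM + reduce instance files and profiler headers. The pattern, repeated throughout the hunks below (ReduceInElementOps and ReduceOutElementOps keep their names), is:

-using ReducePtrsGlobal  = ck::Tuple<F32*, F32*>;
+using RPtrsGlobal       = ck::Tuple<F32*, F32*>;
-    typename ReduceDataType,
+    typename RDataType,
-using ReduceAccDataType = ReduceDataType;
+using RAccDataType      = RDataType;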
Showing 8 changed files with 149 additions and 153 deletions (+149 −153):

  +16 −16  library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
  +20 −20  library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp
  +20 −20  library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp
  +20 −20  library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp
  +17 −17  library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp
  +17 −19  profiler/include/profile_batched_gemm_reduce_impl.hpp
  +22 −23  profiler/include/profile_gemm_bias_add_reduce_impl.hpp
  +17 −18  profiler/include/profile_gemm_reduce_impl.hpp
The diffs of the five library instance .cpp files listed above are collapsed in this view; the three profiler headers are shown in full below.
profiler/include/profile_batched_gemm_reduce_impl.hpp
@@ -23,14 +23,14 @@ namespace instance {
 using F32 = float;
 using F16 = ck::half_t;
-using ReducePtrsGlobal = ck::Tuple<F32*, F32*>;
+using RPtrsGlobal = ck::Tuple<F32*, F32*>;
 using Identity = ck::tensor_operation::element_wise::PassThrough;
 using Square   = ck::tensor_operation::element_wise::UnarySquare;
 using ReduceInElementOps  = ck::Tuple<Identity, Square>;
 using ReduceOutElementOps = ck::Tuple<Identity, Identity>;

 using DeviceGemmReduceNoOpPtr =
-    ck::tensor_operation::device::DeviceGemmReducePtr<0, ReducePtrsGlobal::Size()>;
+    ck::tensor_operation::device::DeviceGemmReducePtr<0, RPtrsGlobal::Size()>;

 void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
     std::vector<DeviceGemmReduceNoOpPtr>&);
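For context, the profiler obtains its candidate kernels through the free function declared above. A minimal sketch of that call, assuming the enclosing namespace is ck::tensor_operation::device::instance (only "namespace instance {" is visible in this hunk):

    // Hedged sketch: collect the pre-built XDL batched GEMM + reduce instances.
    // The fully qualified namespace is an assumption, not shown in the diff.
    std::vector<DeviceGemmReduceNoOpPtr> instance_ptrs;
    ck::tensor_operation::device::instance::
        add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
            instance_ptrs);
    // Each entry is a DeviceGemmReducePtr<0, RPtrsGlobal::Size()>, i.e. a GEMM
    // that also produces RPtrsGlobal::Size() (= 2) reduction outputs.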
@@ -55,7 +55,7 @@ namespace profiler {
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          typename ReduceDataType,
+          typename RDataType,
           typename ALayout,
           typename BLayout,
           typename CLayout>
@@ -95,16 +95,16 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
     Tensor<CDataType> c_g_m_n_host_result(
         f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<RDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
         {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<RDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
         {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
     Tensor<CDataType> c_g_m_n_device_result(
         f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<RDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
         {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
+    Tensor<RDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
         {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));

     std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
@@ -159,7 +159,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
                                                                  BElementOp,
                                                                  CElementOp>;

-    using ReduceAccDataType = ReduceDataType;
+    using RAccDataType = RDataType;

     auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
     auto ref_invoker      = ref_batched_gemm.MakeInvoker();
@@ -173,22 +173,22 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
         {
             for(int m = 0; m < M; ++m)
             {
-                auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
-                auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
+                auto reduce0_acc = reduce0_op.GetIdentityValue<RAccDataType>();
+                auto reduce1_acc = reduce1_op.GetIdentityValue<RAccDataType>();

                 for(int n = 0; n < N; ++n)
                 {
-                    ReduceAccDataType d0_val =
-                        ck::type_convert<ReduceAccDataType>(c_g_m_n_host_result(batch, m, n));
-                    ReduceAccDataType d1_val;
+                    RAccDataType d0_val =
+                        ck::type_convert<RAccDataType>(c_g_m_n_host_result(batch, m, n));
+                    RAccDataType d1_val;

                     square(d1_val, d0_val);
                     reduce0_op(reduce0_acc, d0_val);
                     reduce1_op(reduce1_acc, d1_val);
                 }

-                d0_g_m_host_result(batch, m) = ck::type_convert<ReduceDataType>(reduce0_acc);
-                d1_g_m_host_result(batch, m) = ck::type_convert<ReduceDataType>(reduce1_acc);
+                d0_g_m_host_result(batch, m) = ck::type_convert<RDataType>(reduce0_acc);
+                d1_g_m_host_result(batch, m) = ck::type_convert<RDataType>(reduce1_acc);
             }
         }
     }
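The host loop above is the reference for the two device-side reductions: a pass-through (Identity) reduction and a squared (Square) reduction over n for every (batch, m). A minimal standalone sketch of the same arithmetic, assuming plain float accumulation and additive reduce functors (the CK reduce0_op/reduce1_op and square functors are not reproduced here):

    #include <cstddef>
    #include <vector>

    // Hypothetical helper, not part of the diff: per-row sum and sum of squares
    // of a BatchCount x M x N tensor stored contiguously in row-major order.
    void reference_row_reductions(const std::vector<float>& c, // size B * M * N
                                  std::size_t B, std::size_t M, std::size_t N,
                                  std::vector<float>& d0,      // size B * M, sum over n
                                  std::vector<float>& d1)      // size B * M, sum of squares over n
    {
        for(std::size_t b = 0; b < B; ++b)
            for(std::size_t m = 0; m < M; ++m)
            {
                float acc0 = 0.f; // identity of the pass-through reduction
                float acc1 = 0.f; // identity of the squared reduction
                for(std::size_t n = 0; n < N; ++n)
                {
                    const float v = c[(b * M + m) * N + n];
                    acc0 += v;
                    acc1 += v * v;
                }
                d0[b * M + m] = acc0;
                d1[b * M + m] = acc1;
            }
    }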
@@ -196,10 +196,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
     DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpace());
     DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpace());
     DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpace());
-    DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
-                                 d0_g_m_device_result.mDesc.GetElementSpace());
-    DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
-                                 d1_g_m_device_result.mDesc.GetElementSpace());
+    DeviceMem reduce0_device_buf(sizeof(RDataType) * d0_g_m_device_result.mDesc.GetElementSpace());
+    DeviceMem reduce1_device_buf(sizeof(RDataType) * d1_g_m_device_result.mDesc.GetElementSpace());

     std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
                                       reduce1_device_buf.GetDeviceBuffer()};
profiler/include/profile_gemm_bias_add_reduce_impl.hpp
@@ -23,7 +23,7 @@ namespace instance {
 using F32 = float;
 using F16 = ck::half_t;
-using ReducePtrsGlobal = ck::Tuple<F32*, F32*>;
+using RPtrsGlobal = ck::Tuple<F32*, F32*>;
 using Div      = ck::tensor_operation::element_wise::UnaryDivide;
 using Identity = ck::tensor_operation::element_wise::PassThrough;
 using Square   = ck::tensor_operation::element_wise::UnarySquare;
@@ -31,7 +31,7 @@ using ReduceInElementOps = ck::Tuple<Identity, Square>;
 using ReduceOutElementOps = ck::Tuple<Div, Div>;

 using DeviceGemmBiasAddReduceNoOpPtr =
-    ck::tensor_operation::device::DeviceGemmReducePtr<1, ReducePtrsGlobal::Size()>;
+    ck::tensor_operation::device::DeviceGemmReducePtr<1, RPtrsGlobal::Size()>;

 void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances(
     std::vector<DeviceGemmBiasAddReduceNoOpPtr>&);
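Note that this variant instantiates DeviceGemmReducePtr<1, RPtrsGlobal::Size()> where the plain and batched GEMM + reduce headers use DeviceGemmReducePtr<0, ...>; the first template argument differs, presumably to account for the extra d0 elementwise input of the bias-add path, though the diff itself does not show the parameter's definition.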
@@ -58,7 +58,7 @@ template <typename ADataType,
           typename CDataType,
           typename BiasDataType,
           typename D0DataType,
-          typename ReduceDataType,
+          typename RDataType,
           typename ALayout,
           typename BLayout,
           typename CLayout>
@@ -99,15 +99,15 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
     Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
     Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_host_result(
+    Tensor<RDataType> reduce0_m_host_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_host_result(
+    Tensor<RDataType> reduce1_m_host_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_device_result(
+    Tensor<RDataType> reduce0_m_device_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_device_result(
+    Tensor<RDataType> reduce1_m_device_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));

     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
@@ -166,12 +166,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
     using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                             BDataType,
                                                                             CDataType,
-                                                                            ReduceDataType,
+                                                                            RDataType,
                                                                             AElementOp,
                                                                             BElementOp,
                                                                             CElementOp>;

-    using ReduceAccDataType = ReduceDataType;
+    using RAccDataType = RDataType;

     auto ref_gemm    = ReferenceGemmInstance{};
     auto ref_invoker = ref_gemm.MakeInvoker();
@@ -184,10 +184,10 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
         for(int m = 0; m < M; ++m)
             for(int n = 0; n < N; ++n)
             {
-                ReduceAccDataType acc = static_cast<ReduceAccDataType>(c_m_n_host_result(m, n)) +
-                                        static_cast<ReduceAccDataType>(bias_n(n));
+                RAccDataType acc = static_cast<RAccDataType>(c_m_n_host_result(m, n)) +
+                                   static_cast<RAccDataType>(bias_n(n));

-                ReduceAccDataType d0 = static_cast<ReduceAccDataType>(d0_m_n(m, n));
+                RAccDataType d0 = static_cast<RAccDataType>(d0_m_n(m, n));
                 c_element_op(acc, acc);
                 d0_element_op(d0, d0);
                 acc += d0;
@@ -196,14 +196,13 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
         for(int m = 0; m < M; ++m)
         {
-            auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
-            auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
+            auto reduce0_acc = reduce0_op.GetIdentityValue<RAccDataType>();
+            auto reduce1_acc = reduce1_op.GetIdentityValue<RAccDataType>();

             for(int n = 0; n < N; ++n)
             {
-                ReduceAccDataType d0_val =
-                    ck::type_convert<ReduceAccDataType>(c_m_n_host_result(m, n));
-                ReduceAccDataType d1_val;
+                RAccDataType d0_val = ck::type_convert<RAccDataType>(c_m_n_host_result(m, n));
+                RAccDataType d1_val;

                 square(d1_val, d0_val);
                 reduce0_op(reduce0_acc, d0_val);
@@ -212,8 +211,8 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
             div(reduce0_acc, reduce0_acc);
             div(reduce1_acc, reduce1_acc);
-            reduce0_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce0_acc);
-            reduce1_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce1_acc);
+            reduce0_m_host_result(m) = ck::type_convert<RDataType>(reduce0_acc);
+            reduce1_m_host_result(m) = ck::type_convert<RDataType>(reduce1_acc);
         }
     }
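After the inner loop, reduce0_acc holds the sum over n of the biased GEMM output and reduce1_acc holds the sum of its squares; the two div(...) calls (the UnaryDivide output ops, presumably configured to divide by N) turn them into a mean and a mean of squares, which is what the mean_squaremean naming of the instance files refers to.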
@@ -222,9 +221,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
     DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
     DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpace());
     DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpace());
-    DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
+    DeviceMem reduce0_device_buf(sizeof(RDataType) *
                                  reduce0_m_device_result.mDesc.GetElementSpace());
-    DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
+    DeviceMem reduce1_device_buf(sizeof(RDataType) *
                                  reduce1_m_device_result.mDesc.GetElementSpace());

     std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
@@ -323,8 +322,8 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
     std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                            sizeof(CDataType) * M * N + sizeof(BiasDataType) * M * N +
-                           sizeof(D0DataType) * M * N + sizeof(ReduceDataType) * M +
-                           sizeof(ReduceDataType) * M;
+                           sizeof(D0DataType) * M * N + sizeof(RDataType) * M +
+                           sizeof(RDataType) * M;

     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
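As a worked example of the num_byte formula, assuming f16 for A, B, C, bias, and D0 and f32 for the two reductions (as the f16_f16_f16_f16_f16_f32_f32 instance names suggest), with M = N = K = 1024: num_byte = 2·1024·1024·5 + 4·1024·2 = 10,485,760 + 8,192 = 10,493,952 bytes, roughly 10 MiB moved per invocation.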
profiler/include/profile_gemm_reduce_impl.hpp
@@ -23,7 +23,7 @@ namespace instance {
 using F32 = float;
 using F16 = ck::half_t;
-using ReducePtrsGlobal = ck::Tuple<F32*, F32*>;
+using RPtrsGlobal = ck::Tuple<F32*, F32*>;
 using Div      = ck::tensor_operation::element_wise::UnaryDivide;
 using Identity = ck::tensor_operation::element_wise::PassThrough;
 using Square   = ck::tensor_operation::element_wise::UnarySquare;
@@ -31,7 +31,7 @@ using ReduceInElementOps = ck::Tuple<Identity, Square>;
 using ReduceOutElementOps = ck::Tuple<Div, Div>;

 using DeviceGemmReduceNoOpPtr =
-    ck::tensor_operation::device::DeviceGemmReducePtr<0, ReducePtrsGlobal::Size()>;
+    ck::tensor_operation::device::DeviceGemmReducePtr<0, RPtrsGlobal::Size()>;

 void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances(
     std::vector<DeviceGemmReduceNoOpPtr>&);
@@ -56,7 +56,7 @@ namespace profiler {
 template <typename ADataType,
           typename BDataType,
           typename CDataType,
-          typename ReduceDataType,
+          typename RDataType,
           typename ALayout,
           typename BLayout,
           typename CLayout>
@@ -91,15 +91,15 @@ bool profile_gemm_reduce_impl(int do_verification,
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_host_result(
+    Tensor<RDataType> reduce0_m_host_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_host_result(
+    Tensor<RDataType> reduce1_m_host_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_device_result(
+    Tensor<RDataType> reduce0_m_device_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-    Tensor<ReduceDataType> reduce1_m_device_result(
+    Tensor<RDataType> reduce1_m_device_result(
         HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));

     std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
@@ -151,12 +151,12 @@ bool profile_gemm_reduce_impl(int do_verification,
     using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                             BDataType,
                                                                             CDataType,
-                                                                            ReduceDataType,
+                                                                            RDataType,
                                                                             AElementOp,
                                                                             BElementOp,
                                                                             CElementOp>;

-    using ReduceAccDataType = ReduceDataType;
+    using RAccDataType = RDataType;

     auto ref_gemm    = ReferenceGemmInstance{};
     auto ref_invoker = ref_gemm.MakeInvoker();
@@ -168,14 +168,13 @@ bool profile_gemm_reduce_impl(int do_verification,
         for(int m = 0; m < M; ++m)
         {
-            auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
-            auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
+            auto reduce0_acc = reduce0_op.GetIdentityValue<RAccDataType>();
+            auto reduce1_acc = reduce1_op.GetIdentityValue<RAccDataType>();

             for(int n = 0; n < N; ++n)
             {
-                ReduceAccDataType d0_val =
-                    ck::type_convert<ReduceAccDataType>(c_m_n_host_result(m, n));
-                ReduceAccDataType d1_val;
+                RAccDataType d0_val = ck::type_convert<RAccDataType>(c_m_n_host_result(m, n));
+                RAccDataType d1_val;

                 square(d1_val, d0_val);
                 reduce0_op(reduce0_acc, d0_val);
@@ -184,17 +183,17 @@ bool profile_gemm_reduce_impl(int do_verification,
             div(reduce0_acc, reduce0_acc);
             div(reduce1_acc, reduce1_acc);
-            reduce0_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce0_acc);
-            reduce1_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce1_acc);
+            reduce0_m_host_result(m) = ck::type_convert<RDataType>(reduce0_acc);
+            reduce1_m_host_result(m) = ck::type_convert<RDataType>(reduce1_acc);
         }
     }

     DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
     DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
     DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
-    DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
+    DeviceMem reduce0_device_buf(sizeof(RDataType) *
                                  reduce0_m_device_result.mDesc.GetElementSpace());
-    DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
+    DeviceMem reduce1_device_buf(sizeof(RDataType) *
                                  reduce1_m_device_result.mDesc.GetElementSpace());

     std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),