Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
e4e99a49
Commit
e4e99a49
authored
Sep 22, 2022
by
Po-Yen, Chen
Browse files
Use new utilities to shorten codes
parent
7acbf104
Changes
144
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
450 additions
and
614 deletions
+450
-614
example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
+20
-20
example/01_gemm/run_gemm_example.inc
example/01_gemm/run_gemm_example.inc
+36
-46
example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
+42
-42
example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
+40
-39
example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
...mm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+21
-21
example/09_convnd_fwd/convnd_fwd_common.hpp
example/09_convnd_fwd/convnd_fwd_common.hpp
+30
-51
example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
+4
-11
example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
...multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
+28
-40
example/12_reduce/reduce_blockwise_impl.hpp
example/12_reduce/reduce_blockwise_impl.hpp
+34
-39
example/12_reduce/reduce_blockwise_two_call.cpp
example/12_reduce/reduce_blockwise_two_call.cpp
+33
-41
example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
+25
-35
example/13_pool2d_fwd/pool2d_fwd_common.hpp
example/13_pool2d_fwd/pool2d_fwd_common.hpp
+44
-45
example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
...quant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
+22
-21
example/15_grouped_gemm/common.hpp
example/15_grouped_gemm/common.hpp
+34
-0
example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp
example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp
+1
-27
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+1
-27
example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
+1
-27
example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
+1
-24
example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
+1
-24
example/15_grouped_gemm/run_grouped_gemm_example.inc
example/15_grouped_gemm/run_grouped_gemm_example.inc
+32
-34
No files found.
example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp
View file @
e4e99a49
...
@@ -63,10 +63,10 @@ template <typename DataType>
...
@@ -63,10 +63,10 @@ template <typename DataType>
std
::
ostream
&
show_2d_matrix
(
std
::
ostream
&
os
,
Tensor
<
DataType
>&
matrix
)
std
::
ostream
&
show_2d_matrix
(
std
::
ostream
&
os
,
Tensor
<
DataType
>&
matrix
)
{
{
os
<<
"["
<<
std
::
endl
;
os
<<
"["
<<
std
::
endl
;
for
(
size_t
x
=
0
;
x
<
matrix
.
mDesc
.
GetLengths
()[
0
];
x
++
)
for
(
size_t
x
=
0
;
x
<
matrix
.
GetLengths
()[
0
];
x
++
)
{
{
os
<<
"["
;
os
<<
"["
;
for
(
size_t
y
=
0
;
y
<
matrix
.
mDesc
.
GetLengths
()[
1
];
y
++
)
for
(
size_t
y
=
0
;
y
<
matrix
.
GetLengths
()[
1
];
y
++
)
{
{
os
<<
std
::
setw
(
5
)
<<
static_cast
<
float
>
(
matrix
(
x
,
y
));
os
<<
std
::
setw
(
5
)
<<
static_cast
<
float
>
(
matrix
(
x
,
y
));
}
}
...
@@ -133,17 +133,17 @@ int main(int argc, char* argv[])
...
@@ -133,17 +133,17 @@ int main(int argc, char* argv[])
exit
(
0
);
exit
(
0
);
}
}
using
namespace
ck
::
literals
;
auto
f_host_tensor_descriptor
=
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
::
value
)
if
constexpr
(
std
::
is_same
_v
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
)
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
std
::
vector
<
std
::
size_t
>
({
stride
,
1
}));
}
}
else
else
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
std
::
vector
<
std
::
size_t
>
({
1
,
stride
}));
}
}
};
};
...
@@ -152,9 +152,9 @@ int main(int argc, char* argv[])
...
@@ -152,9 +152,9 @@ int main(int argc, char* argv[])
Tensor
<
CDataType
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
CDataType
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
CDataType
>
c_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
CDataType
>
c_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
Get
Desc
()
<<
std
::
endl
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
@@ -173,12 +173,12 @@ int main(int argc, char* argv[])
...
@@ -173,12 +173,12 @@ int main(int argc, char* argv[])
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_1
<
ADataType
>
{
1
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_1
<
ADataType
>
{
1
});
}
}
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
a_m_k_device_buf
(
a_m_k
.
GetMemory
Size
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_k_n_device_buf
(
b_k_n
.
GetMemory
Size
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
c_m_n_device_buf
(
c_m_n_device_result
.
GetMemory
Size
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
data
());
auto
a_element_op
=
AElementOp
{};
auto
a_element_op
=
AElementOp
{};
auto
b_element_op
=
BElementOp
{};
auto
b_element_op
=
BElementOp
{};
...
@@ -187,9 +187,9 @@ int main(int argc, char* argv[])
...
@@ -187,9 +187,9 @@ int main(int argc, char* argv[])
// do GEMM
// do GEMM
auto
gemm
=
DeviceGemmInstance
{};
auto
gemm
=
DeviceGemmInstance
{};
auto
invoker
=
gemm
.
MakeInvoker
();
auto
invoker
=
gemm
.
MakeInvoker
();
auto
argument
=
gemm
.
MakeArgument
(
static_cast
<
ADataType
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()
)
,
auto
argument
=
gemm
.
MakeArgument
(
a_m_k_device_buf
.
GetDeviceBuffer
(),
static_cast
<
BDataType
*>
(
b_k_n_device_buf
.
GetDeviceBuffer
()
)
,
b_k_n_device_buf
.
GetDeviceBuffer
(),
static_cast
<
CDataType
*>
(
c_m_n_device_buf
.
GetDeviceBuffer
()
)
,
c_m_n_device_buf
.
GetDeviceBuffer
(),
M
,
M
,
N
,
N
,
K
,
K
,
...
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
...
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
data
());
if
(
do_verification
)
if
(
do_verification
)
{
{
...
@@ -240,7 +240,7 @@ int main(int argc, char* argv[])
...
@@ -240,7 +240,7 @@ int main(int argc, char* argv[])
show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl;
show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl;
}
}
#endif
#endif
ck
::
utils
::
check_err
(
c_m_n_device_result
.
mData
,
c_m_n_host_result
.
mData
);
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
);
}
}
return
0
;
return
0
;
...
...
example/01_gemm/run_gemm_example.inc
View file @
e4e99a49
...
@@ -9,10 +9,10 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
...
@@ -9,10 +9,10 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
static_assert
(
sizeof
(
ck
::
int4_t
)
==
sizeof
(
int8_t
));
static_assert
(
sizeof
(
ck
::
int4_t
)
==
sizeof
(
int8_t
));
#endif
#endif
using
namespace
ck
::
literals
;
auto
&
[
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
]
=
problem_size
;
auto
&
[
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
]
=
problem_size
;
using
namespace
ck
::
literals
;
auto
f_host_tensor_descriptor
=
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
if
constexpr
(
std
::
is_same_v
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
)
if
constexpr
(
std
::
is_same_v
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
)
...
@@ -32,41 +32,38 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
...
@@ -32,41 +32,38 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
{
case
0
:
break
;
case
0
:
break
;
case
1
:
case
1
:
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
5.
f
,
5.
f
}(
a_m_k
.
begin
(),
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
5.
f
,
5.
f
}(
a_m_k
);
a_m_k
.
end
());
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
-
5.
f
,
5.
f
}(
b_k_n
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
-
5.
f
,
5.
f
}(
b_k_n
.
begin
(),
b_k_n
.
end
());
break
;
break
;
default
:
default
:
ck
::
utils
::
FillUniformDistribution
<
ADataType
>
{
-
1.
f
,
1.
f
}(
a_m_k
.
begin
(),
a_m_k
.
end
()
);
ck
::
utils
::
FillUniformDistribution
<
ADataType
>
{
-
1.
f
,
1.
f
}(
a_m_k
);
ck
::
utils
::
FillUniformDistribution
<
BDataType
>
{
-
1.
f
,
1.
f
}(
b_k_n
.
begin
(),
b_k_n
.
end
()
);
ck
::
utils
::
FillUniformDistribution
<
BDataType
>
{
-
1.
f
,
1.
f
}(
b_k_n
);
}
}
Tensor
<
CDataType
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
CDataType
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
CDataType
>
c_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
CDataType
>
c_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
Get
Desc
()
<<
std
::
endl
;
#ifdef BUILD_INT4_EXAMPLE
#ifdef BUILD_INT4_EXAMPLE
DeviceMem
a_m_k_device_buf
(
sizeof
(
KernelADataType
)
*
a_m_k
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
a_m_k_device_buf
(
a_m_k
.
GetMemorySize
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
KernelBDataType
)
*
b_k_n
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
b_k_n_device_buf
(
b_k_n
.
GetMemorySize
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
KernelCDataType
)
*
DeviceMem
c_m_n_device_buf
(
c_m_n_device_result
.
GetMemorySize
());
c_m_n_device_result
.
mDesc
.
GetElementSpaceSize
());
const
Tensor
<
KernelADataType
>
a_m_k_converted
(
a_m_k
);
const
Tensor
<
KernelADataType
>
a_m_k_converted
(
a_m_k
);
const
Tensor
<
KernelBDataType
>
b_k_n_converted
(
b_k_n
);
const
Tensor
<
KernelBDataType
>
b_k_n_converted
(
b_k_n
);
a_m_k_device_buf
.
ToDevice
(
a_m_k_converted
.
mData
.
data
());
a_m_k_device_buf
.
ToDevice
(
a_m_k_converted
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n_converted
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n_converted
.
data
());
#else
#else
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
a_m_k_device_buf
(
a_m_k
.
GetMemory
Size
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_k_n_device_buf
(
b_k_n
.
GetMemory
Size
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
c_m_n_device_buf
(
c_m_n_device_result
.
GetMemory
Size
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
data
());
#endif
#endif
auto
a_element_op
=
AElementOp
{};
auto
a_element_op
=
AElementOp
{};
...
@@ -76,16 +73,9 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
...
@@ -76,16 +73,9 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
// do GEMM
// do GEMM
auto
gemm
=
DeviceGemmInstance
{};
auto
gemm
=
DeviceGemmInstance
{};
auto
invoker
=
gemm
.
MakeInvoker
();
auto
invoker
=
gemm
.
MakeInvoker
();
auto
argument
=
gemm
.
MakeArgument
(
auto
argument
=
gemm
.
MakeArgument
(
a_m_k_device_buf
.
GetDeviceBuffer
(),
#ifdef BUILD_INT4_EXAMPLE
b_k_n_device_buf
.
GetDeviceBuffer
(),
static_cast
<
KernelADataType
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()),
c_m_n_device_buf
.
GetDeviceBuffer
(),
static_cast
<
KernelBDataType
*>
(
b_k_n_device_buf
.
GetDeviceBuffer
()),
static_cast
<
KernelCDataType
*>
(
c_m_n_device_buf
.
GetDeviceBuffer
()),
#else
static_cast
<
ADataType
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
BDataType
*>
(
b_k_n_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CDataType
*>
(
c_m_n_device_buf
.
GetDeviceBuffer
()),
#endif
M
,
M
,
N
,
N
,
K
,
K
,
...
@@ -127,17 +117,17 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
...
@@ -127,17 +117,17 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
#ifdef BUILD_INT4_EXAMPLE
#ifdef BUILD_INT4_EXAMPLE
Tensor
<
CDataType
>
c_m_n_device_result_converted
(
c_m_n_host_result
.
m
Desc
);
Tensor
<
CDataType
>
c_m_n_device_result_converted
(
c_m_n_host_result
.
Get
Desc
()
);
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result_converted
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result_converted
.
data
());
c_m_n_device_result
=
c_m_n_device_result_converted
.
CopyAsType
<
CDataType
>
();
c_m_n_device_result
=
c_m_n_device_result_converted
.
CopyAsType
<
CDataType
>
();
return
ck
::
utils
::
check_err
(
c_m_n_device_result_converted
.
mData
,
c_m_n_host_result
.
mData
);
return
ck
::
utils
::
check_err
(
c_m_n_device_result_converted
,
c_m_n_host_result
);
#else
#else
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
data
());
return
ck
::
utils
::
check_err
(
c_m_n_device_result
.
mData
,
c_m_n_host_result
.
mData
);
return
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
);
#endif
#endif
}
}
...
...
example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp
View file @
e4e99a49
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/check_err.hpp"
struct
AlphaBetaAdd
struct
AlphaBetaAdd
{
{
...
@@ -175,17 +177,17 @@ int main(int argc, char* argv[])
...
@@ -175,17 +177,17 @@ int main(int argc, char* argv[])
exit
(
0
);
exit
(
0
);
}
}
using
namespace
ck
::
literals
;
auto
f_host_tensor_descriptor
=
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
::
value
)
if
constexpr
(
std
::
is_same
_v
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
)
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
std
::
vector
<
std
::
size_t
>
({
stride
,
1
}));
}
}
else
else
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
std
::
vector
<
std
::
size_t
>
({
1
,
stride
}));
}
}
};
};
...
@@ -195,10 +197,10 @@ int main(int argc, char* argv[])
...
@@ -195,10 +197,10 @@ int main(int argc, char* argv[])
Tensor
<
EDataType
>
e_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
Tensor
<
EDataType
>
e_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
Tensor
<
EDataType
>
e_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
Tensor
<
EDataType
>
e_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"d_m_n: "
<<
d_m_n
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"d_m_n: "
<<
d_m_n
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"e_m_n: "
<<
e_m_n_host_result
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"e_m_n: "
<<
e_m_n_host_result
.
Get
Desc
()
<<
std
::
endl
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
@@ -214,15 +216,15 @@ int main(int argc, char* argv[])
...
@@ -214,15 +216,15 @@ int main(int argc, char* argv[])
d_m_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
DDataType
>
{
-
0.5
,
0.5
});
d_m_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
DDataType
>
{
-
0.5
,
0.5
});
}
}
DeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
a_device_buf
(
a_m_k
.
GetMemory
Size
());
DeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_device_buf
(
b_k_n
.
GetMemory
Size
());
DeviceMem
d_device_buf
(
sizeof
(
DDataType
)
*
d_m_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
d_device_buf
(
d_m_n
.
GetMemory
Size
());
DeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
e_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
e_device_buf
(
e_m_n_device_result
.
GetMemory
Size
());
a_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
a_device_buf
.
ToDevice
(
a_m_k
.
data
());
b_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_k_n
.
data
());
d_device_buf
.
ToDevice
(
d_m_n
.
mData
.
data
());
d_device_buf
.
ToDevice
(
d_m_n
.
data
());
e_device_buf
.
ToDevice
(
e_m_n_device_result
.
mData
.
data
());
e_device_buf
.
ToDevice
(
e_m_n_device_result
.
data
());
auto
a_element_op
=
AElementOp
{};
auto
a_element_op
=
AElementOp
{};
auto
b_element_op
=
BElementOp
{};
auto
b_element_op
=
BElementOp
{};
...
@@ -231,17 +233,16 @@ int main(int argc, char* argv[])
...
@@ -231,17 +233,16 @@ int main(int argc, char* argv[])
// do GEMM
// do GEMM
auto
device_op
=
DeviceOpInstance
{};
auto
device_op
=
DeviceOpInstance
{};
auto
invoker
=
device_op
.
MakeInvoker
();
auto
invoker
=
device_op
.
MakeInvoker
();
auto
argument
=
auto
argument
=
device_op
.
MakeArgument
(
a_device_buf
.
GetDeviceBuffer
(),
device_op
.
MakeArgument
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
1
>
{
d_device_buf
.
GetDeviceBuffer
()},
ck
::
utils
::
to_array
(
{
d_device_buf
.
GetDeviceBuffer
()}
)
,
e_device_buf
.
GetDeviceBuffer
(),
e_device_buf
.
GetDeviceBuffer
(),
M
,
M
,
N
,
N
,
K
,
K
,
StrideA
,
StrideA
,
StrideB
,
StrideB
,
std
::
array
<
ck
::
index_t
,
1
>
{
StrideD
},
ck
::
utils
::
to_array
(
{
StrideD
}
)
,
StrideE
,
StrideE
,
a_element_op
,
a_element_op
,
b_element_op
,
b_element_op
,
...
@@ -267,12 +268,11 @@ int main(int argc, char* argv[])
...
@@ -267,12 +268,11 @@ int main(int argc, char* argv[])
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s"
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
<<
std
::
endl
;
e_device_buf
.
FromDevice
(
e_m_n_device_result
.
mData
.
data
());
e_device_buf
.
FromDevice
(
e_m_n_device_result
.
data
());
if
(
do_verification
)
if
(
do_verification
)
{
{
Tensor
<
CShuffleDataType
>
c_m_n
(
HostTensorDescriptor
(
Tensor
<
CShuffleDataType
>
c_m_n
(
HostTensorDescriptor
({
M
,
N
}));
std
::
vector
<
std
::
size_t
>
{
static_cast
<
std
::
size_t
>
(
M
),
static_cast
<
std
::
size_t
>
(
N
)}));
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
BDataType
,
...
@@ -297,9 +297,9 @@ int main(int argc, char* argv[])
...
@@ -297,9 +297,9 @@ int main(int argc, char* argv[])
}
}
}
}
e_device_buf
.
FromDevice
(
e_m_n_device_result
.
mData
.
data
());
e_device_buf
.
FromDevice
(
e_m_n_device_result
.
data
());
return
ck
::
utils
::
check_err
(
e_m_n_device_result
.
mData
,
e_m_n_host_result
.
mData
)
?
0
:
1
;
return
ck
::
utils
::
check_err
(
e_m_n_device_result
,
e_m_n_host_result
)
?
0
:
1
;
}
}
return
0
;
return
0
;
...
...
example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp
View file @
e4e99a49
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/check_err.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
@@ -153,17 +155,17 @@ int main(int argc, char* argv[])
...
@@ -153,17 +155,17 @@ int main(int argc, char* argv[])
exit
(
0
);
exit
(
0
);
}
}
using
namespace
ck
::
literals
;
auto
f_host_tensor_descriptor
=
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
::
value
)
if
constexpr
(
std
::
is_same
_v
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
)
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
std
::
vector
<
std
::
size_t
>
({
stride
,
1
}));
}
}
else
else
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
std
::
vector
<
std
::
size_t
>
({
1
,
stride
}));
}
}
};
};
...
@@ -173,10 +175,10 @@ int main(int argc, char* argv[])
...
@@ -173,10 +175,10 @@ int main(int argc, char* argv[])
Tensor
<
EDataType
>
e_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
Tensor
<
EDataType
>
e_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
Tensor
<
EDataType
>
e_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
Tensor
<
EDataType
>
e_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"d_m_n: "
<<
d_m_n
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"d_m_n: "
<<
d_m_n
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"e_m_n: "
<<
e_m_n_host_result
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"e_m_n: "
<<
e_m_n_host_result
.
Get
Desc
()
<<
std
::
endl
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
@@ -192,14 +194,14 @@ int main(int argc, char* argv[])
...
@@ -192,14 +194,14 @@ int main(int argc, char* argv[])
d_m_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
DDataType
>
{
0.0
,
1.0
});
d_m_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
DDataType
>
{
0.0
,
1.0
});
}
}
DeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
a_device_buf
(
a_m_k
.
GetMemory
Size
());
DeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_device_buf
(
b_k_n
.
GetMemory
Size
());
DeviceMem
d_device_buf
(
sizeof
(
DDataType
)
*
d_m_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
d_device_buf
(
d_m_n
.
GetMemory
Size
());
DeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
e_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
e_device_buf
(
e_m_n_device_result
.
GetMemory
Size
());
a_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
a_device_buf
.
ToDevice
(
a_m_k
.
data
());
b_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_k_n
.
data
());
d_device_buf
.
ToDevice
(
d_m_n
.
mData
.
data
());
d_device_buf
.
ToDevice
(
d_m_n
.
data
());
auto
a_element_op
=
AElementOp
{};
auto
a_element_op
=
AElementOp
{};
auto
b_element_op
=
BElementOp
{};
auto
b_element_op
=
BElementOp
{};
...
@@ -210,17 +212,16 @@ int main(int argc, char* argv[])
...
@@ -210,17 +212,16 @@ int main(int argc, char* argv[])
auto
invoker
=
device_op
.
MakeInvoker
();
auto
invoker
=
device_op
.
MakeInvoker
();
auto
argument
=
auto
argument
=
device_op
.
MakeArgument
(
a_device_buf
.
GetDeviceBuffer
(),
device_op
.
MakeArgument
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
1
>
{
d_device_buf
.
GetDeviceBuffer
()},
ck
::
utils
::
to_array
(
{
d_device_buf
.
GetDeviceBuffer
()}
)
,
e_device_buf
.
GetDeviceBuffer
(),
e_device_buf
.
GetDeviceBuffer
(),
M
,
M
,
N
,
N
,
K
,
K
,
StrideA
,
StrideA
,
StrideB
,
StrideB
,
std
::
array
<
ck
::
index_t
,
1
>
{
0
},
ck
::
utils
::
to_array
(
{
0
}
)
,
StrideE
,
StrideE
,
a_element_op
,
a_element_op
,
b_element_op
,
b_element_op
,
...
@@ -247,7 +248,7 @@ int main(int argc, char* argv[])
...
@@ -247,7 +248,7 @@ int main(int argc, char* argv[])
if
(
do_verification
)
if
(
do_verification
)
{
{
e_device_buf
.
FromDevice
(
e_m_n_device_result
.
mData
.
data
());
e_device_buf
.
FromDevice
(
e_m_n_device_result
.
data
());
Tensor
<
AccDataType
>
c_m_n
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
Tensor
<
AccDataType
>
c_m_n
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
...
@@ -275,7 +276,7 @@ int main(int argc, char* argv[])
...
@@ -275,7 +276,7 @@ int main(int argc, char* argv[])
}
}
}
}
return
ck
::
utils
::
check_err
(
e_m_n_device_result
.
mData
,
e_m_n_host_result
.
mData
)
?
0
:
1
;
return
ck
::
utils
::
check_err
(
e_m_n_device_result
,
e_m_n_host_result
)
?
0
:
1
;
}
}
return
0
;
return
0
;
...
...
example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
View file @
e4e99a49
...
@@ -35,11 +35,11 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
...
@@ -35,11 +35,11 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
>
>
e_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
e_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"d0_m_n: "
<<
d0_m_n
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"d0_m_n: "
<<
d0_m_n
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"d1_m_n: "
<<
d1_m_n
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"d1_m_n: "
<<
d1_m_n
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"e_m_n: "
<<
e_m_n_host_result
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"e_m_n: "
<<
e_m_n_host_result
.
Get
Desc
()
<<
std
::
endl
;
switch
(
config
.
init_method
)
switch
(
config
.
init_method
)
{
{
...
@@ -57,11 +57,11 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
...
@@ -57,11 +57,11 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
d1_m_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
D1DataType
>
{
0.0
,
1.0
});
d1_m_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
D1DataType
>
{
0.0
,
1.0
});
}
}
DeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
a_device_buf
(
a_m_k
.
GetMemory
Size
());
DeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_device_buf
(
b_k_n
.
GetMemory
Size
());
DeviceMem
d0_device_buf
(
sizeof
(
D0DataType
)
*
d0_m_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
d0_device_buf
(
d0_m_n
.
GetMemory
Size
());
DeviceMem
d1_device_buf
(
sizeof
(
D1DataType
)
*
d1_m_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
d1_device_buf
(
d1_m_n
.
GetMemory
Size
());
DeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
e_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
e_device_buf
(
e_m_n_device_result
.
GetMemory
Size
());
#ifdef BUILD_INT4_EXAMPLE
#ifdef BUILD_INT4_EXAMPLE
const
Tensor
<
KernelADataType
>
a_m_k_converted
(
a_m_k
);
const
Tensor
<
KernelADataType
>
a_m_k_converted
(
a_m_k
);
...
@@ -69,15 +69,15 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
...
@@ -69,15 +69,15 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
const
Tensor
<
KernelD0DataType
>
d0_m_n_converted
(
d0_m_n
);
const
Tensor
<
KernelD0DataType
>
d0_m_n_converted
(
d0_m_n
);
const
Tensor
<
KernelD1DataType
>
d1_m_n_converted
(
d1_m_n
);
const
Tensor
<
KernelD1DataType
>
d1_m_n_converted
(
d1_m_n
);
a_device_buf
.
ToDevice
(
a_m_k_converted
.
mData
.
data
());
a_device_buf
.
ToDevice
(
a_m_k_converted
.
data
());
b_device_buf
.
ToDevice
(
b_k_n_converted
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_k_n_converted
.
data
());
d0_device_buf
.
ToDevice
(
d0_m_n_converted
.
mData
.
data
());
d0_device_buf
.
ToDevice
(
d0_m_n_converted
.
data
());
d1_device_buf
.
ToDevice
(
d1_m_n_converted
.
mData
.
data
());
d1_device_buf
.
ToDevice
(
d1_m_n_converted
.
data
());
#else
#else
a_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
a_device_buf
.
ToDevice
(
a_m_k
.
data
());
b_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_k_n
.
data
());
d0_device_buf
.
ToDevice
(
d0_m_n
.
mData
.
data
());
d0_device_buf
.
ToDevice
(
d0_m_n
.
data
());
d1_device_buf
.
ToDevice
(
d1_m_n
.
mData
.
data
());
d1_device_buf
.
ToDevice
(
d1_m_n
.
data
());
#endif
#endif
auto
a_element_op
=
AElementOp
{};
auto
a_element_op
=
AElementOp
{};
...
@@ -142,14 +142,14 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
...
@@ -142,14 +142,14 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
}
}
}
}
e_device_buf
.
FromDevice
(
e_m_n_device_result
.
mData
.
data
());
e_device_buf
.
FromDevice
(
e_m_n_device_result
.
data
());
#ifdef BUILD_INT4_EXAMPLE
#ifdef BUILD_INT4_EXAMPLE
const
Tensor
<
EDataType
>
e_m_n_device_result_converted
(
e_m_n_device_result
);
const
Tensor
<
EDataType
>
e_m_n_device_result_converted
(
e_m_n_device_result
);
return
ck
::
utils
::
check_err
(
e_m_n_device_result_converted
.
mData
,
e_m_n_host_result
.
mData
);
return
ck
::
utils
::
check_err
(
e_m_n_device_result_converted
,
e_m_n_host_result
);
#else
#else
return
ck
::
utils
::
check_err
(
e_m_n_device_result
.
mData
,
e_m_n_host_result
.
mData
);
return
ck
::
utils
::
check_err
(
e_m_n_device_result
,
e_m_n_host_result
);
#endif
#endif
}
}
...
...
example/09_convnd_fwd/convnd_fwd_common.hpp
View file @
e4e99a49
...
@@ -10,13 +10,14 @@
...
@@ -10,13 +10,14 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
void
print_helper_msg
()
void
print_helper_msg
()
{
{
...
@@ -50,9 +51,9 @@ bool run_grouped_conv_fwd(bool do_verification,
...
@@ -50,9 +51,9 @@ bool run_grouped_conv_fwd(bool do_verification,
Tensor
<
OutDataType
>
out_host
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
out_host
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
out_device
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
out_device
(
out_g_n_k_wos_desc
);
std
::
cout
<<
"in: "
<<
in
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"in: "
<<
in
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"wei: "
<<
wei
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"wei: "
<<
wei
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"out: "
<<
out_host
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"out: "
<<
out_host
.
Get
Desc
()
<<
std
::
endl
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
@@ -66,56 +67,34 @@ bool run_grouped_conv_fwd(bool do_verification,
...
@@ -66,56 +67,34 @@ bool run_grouped_conv_fwd(bool do_verification,
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiDataType
>
{
-
0.5
,
0.5
});
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiDataType
>
{
-
0.5
,
0.5
});
}
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
in_device_buf
(
in
.
GetMemorySize
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
wei
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
wei_device_buf
(
wei
.
GetMemorySize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_device
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
out_device
.
GetMemorySize
());
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
in_device_buf
.
ToDevice
(
in
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
data
());
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths
{};
using
ck
::
utils
::
empty_array
,
ck
::
utils
::
to_array
;
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
auto
&
x
,
auto
&
y
)
{
std
::
copy
(
x
.
begin
(),
x
.
end
(),
y
.
begin
());
};
copy
(
in_g_n_c_wis_desc
.
GetLengths
(),
a_g_n_c_wis_lengths
);
copy
(
in_g_n_c_wis_desc
.
GetStrides
(),
a_g_n_c_wis_strides
);
copy
(
wei_g_k_c_xs_desc
.
GetLengths
(),
b_g_k_c_xs_lengths
);
copy
(
wei_g_k_c_xs_desc
.
GetStrides
(),
b_g_k_c_xs_strides
);
copy
(
out_g_n_k_wos_desc
.
GetLengths
(),
e_g_n_k_wos_lengths
);
copy
(
out_g_n_k_wos_desc
.
GetStrides
(),
e_g_n_k_wos_strides
);
copy
(
conv_param
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_param
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_param
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_param
.
input_right_pads_
,
input_right_pads
);
// do Conv
// do Conv
auto
conv
=
DeviceConvNDFwdInstance
{};
auto
conv
=
DeviceConvNDFwdInstance
{};
auto
invoker
=
conv
.
MakeInvoker
();
auto
invoker
=
conv
.
MakeInvoker
();
auto
argument
=
conv
.
MakeArgument
(
in_device_buf
.
GetDeviceBuffer
(),
auto
argument
=
conv
.
MakeArgument
(
in_device_buf
.
GetDeviceBuffer
(),
wei_device_buf
.
GetDeviceBuffer
(),
wei_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
0
>
{}
,
empty_array
()
,
out_device_buf
.
GetDeviceBuffer
(),
out_device_buf
.
GetDeviceBuffer
(),
a
_g_n_c_wis_
l
engths
,
to_array
(
in
_g_n_c_wis_
desc
.
GetL
engths
())
,
a
_g_n_c_wis_
s
trides
,
to_array
(
in
_g_n_c_wis_
desc
.
GetS
trides
())
,
b
_g_k_c_xs_
l
engths
,
to_array
(
wei
_g_k_c_xs_
desc
.
GetL
engths
())
,
b
_g_k_c_xs_
s
trides
,
to_array
(
wei
_g_k_c_xs_
desc
.
GetS
trides
())
,
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
0
>
{{}}
,
empty_array
()
,
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
0
>
{{}}
,
empty_array
()
,
e
_g_n_k_wos_
l
engths
,
to_array
(
out
_g_n_k_wos_
desc
.
GetL
engths
())
,
e
_g_n_k_wos_
s
trides
,
to_array
(
out
_g_n_k_wos_
desc
.
GetS
trides
())
,
conv_filter_strides
,
to_array
(
conv_param
.
conv_filter_strides
_
)
,
conv_filter_dilations
,
to_array
(
conv_param
.
conv_filter_dilations
_
)
,
input_left_pads
,
to_array
(
conv_param
.
input_left_pads
_
)
,
input_right_pads
,
to_array
(
conv_param
.
input_right_pads
_
)
,
in_element_op
,
in_element_op
,
wei_element_op
,
wei_element_op
,
out_element_op
);
out_element_op
);
...
@@ -161,10 +140,10 @@ bool run_grouped_conv_fwd(bool do_verification,
...
@@ -161,10 +140,10 @@ bool run_grouped_conv_fwd(bool do_verification,
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
out_device_buf
.
FromDevice
(
out_device
.
mData
.
data
());
out_device_buf
.
FromDevice
(
out_device
.
data
());
return
ck
::
utils
::
check_err
(
return
ck
::
utils
::
check_err
(
out_device
.
mData
,
out_host
.
mData
,
"Error: incorrect results!"
,
1e-5
f
,
1e-4
f
);
out_device
,
out_host
,
"Error: incorrect results!"
,
1e-5
f
,
1e-4
f
);
}
}
return
true
;
return
true
;
...
...
example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
View file @
e4e99a49
...
@@ -16,6 +16,9 @@
...
@@ -16,6 +16,9 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
...
@@ -23,7 +26,6 @@
...
@@ -23,7 +26,6 @@
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using
BF16
=
ck
::
bhalf_t
;
using
BF16
=
ck
::
bhalf_t
;
using
FP16
=
ck
::
half_t
;
using
FP16
=
ck
::
half_t
;
...
@@ -140,9 +142,7 @@ make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
...
@@ -140,9 +142,7 @@ make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
{
{
std
::
vector
<
ck
::
index_t
>
dimensions
{
problem_size
.
G_
,
problem_size
.
N_
};
std
::
vector
<
ck
::
index_t
>
dimensions
{
problem_size
.
G_
,
problem_size
.
N_
};
std
::
copy
(
begin
(
problem_size
.
output_spatial_lengths_
),
ck
::
ranges
::
copy
(
problem_size
.
output_spatial_lengths_
,
std
::
back_inserter
(
dimensions
));
end
(
problem_size
.
output_spatial_lengths_
),
std
::
back_inserter
(
dimensions
));
return
HostTensorDescriptor
(
dimensions
);
return
HostTensorDescriptor
(
dimensions
);
}
}
...
@@ -158,10 +158,3 @@ void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
...
@@ -158,10 +158,3 @@ void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
assert
(
size
(
descriptor
.
GetStrides
())
==
size
(
strides
));
assert
(
size
(
descriptor
.
GetStrides
())
==
size
(
strides
));
std
::
copy_n
(
begin
(
descriptor
.
GetStrides
()),
size
(
descriptor
.
GetStrides
()),
begin
(
strides
));
std
::
copy_n
(
begin
(
descriptor
.
GetStrides
()),
size
(
descriptor
.
GetStrides
()),
begin
(
strides
));
}
}
template
<
typename
Range
,
typename
OutputIterator
>
auto
copy
(
const
Range
&
range
,
OutputIterator
iter
)
->
decltype
(
std
::
copy
(
std
::
begin
(
range
),
std
::
end
(
range
),
iter
))
{
return
std
::
copy
(
std
::
begin
(
range
),
std
::
end
(
range
),
iter
);
}
example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc
View file @
e4e99a49
...
@@ -77,32 +77,28 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
...
@@ -77,32 +77,28 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
{
{
case
0
:
break
;
case
0
:
break
;
case
1
:
case
1
:
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
8
,
7
}(
conv_input
.
begin
(),
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
8
,
7
}(
conv_input
);
conv_input
.
end
());
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
-
8
,
7
}(
conv_weight
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
-
8
,
7
}(
conv_weight
.
begin
(),
conv_weight
.
end
());
break
;
break
;
default
:
default
:
ck
::
utils
::
FillUniformDistribution
<
ADataType
>
{
-
5
,
5
}(
conv_input
.
begin
(),
conv_input
.
end
());
ck
::
utils
::
FillUniformDistribution
<
ADataType
>
{
-
5
,
5
}(
conv_input
);
ck
::
utils
::
FillUniformDistribution
<
BDataType
>
{
-
5
,
5
}(
conv_weight
.
begin
(),
ck
::
utils
::
FillUniformDistribution
<
BDataType
>
{
-
5
,
5
}(
conv_weight
);
conv_weight
.
end
());
}
}
DeviceMem
conv_input_device_buf
(
sizeof
(
ADataType
)
*
conv_input
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
conv_input_device_buf
(
conv_input
.
GetMemorySize
());
DeviceMem
conv_weight_device_buf
(
sizeof
(
BDataType
)
*
conv_weight
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
conv_weight_device_buf
(
conv_weight
.
GetMemorySize
());
DeviceMem
conv_output_device_buf
(
sizeof
(
EDataType
)
*
DeviceMem
conv_output_device_buf
(
conv_output_device
.
GetMemorySize
());
conv_output_device
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
r0_device_buf
(
r0_device
.
GetMemorySize
());
DeviceMem
r0_device_buf
(
sizeof
(
R0DataType
)
*
r0_device
.
mDesc
.
GetElementSpaceSize
());
#ifdef BUILD_INT4_EXAMPLE
#ifdef BUILD_INT4_EXAMPLE
const
Tensor
<
KernelADataType
>
conv_input_converted
(
conv_input
);
const
Tensor
<
KernelADataType
>
conv_input_converted
(
conv_input
);
const
Tensor
<
KernelBDataType
>
conv_weight_converted
(
conv_weight
);
const
Tensor
<
KernelBDataType
>
conv_weight_converted
(
conv_weight
);
conv_input_device_buf
.
ToDevice
(
conv_input_converted
.
mData
.
data
());
conv_input_device_buf
.
ToDevice
(
conv_input_converted
.
data
());
conv_weight_device_buf
.
ToDevice
(
conv_weight_converted
.
mData
.
data
());
conv_weight_device_buf
.
ToDevice
(
conv_weight_converted
.
data
());
#else
#else
conv_input_device_buf
.
ToDevice
(
conv_input
.
mData
.
data
());
conv_input_device_buf
.
ToDevice
(
conv_input
.
data
());
conv_weight_device_buf
.
ToDevice
(
conv_weight
.
mData
.
data
());
conv_weight_device_buf
.
ToDevice
(
conv_weight
.
data
());
#endif
#endif
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
conv_input_g_n_c_wis_lengths
{},
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
conv_input_g_n_c_wis_lengths
{},
...
@@ -112,8 +108,6 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
...
@@ -112,8 +108,6 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
conv_output_g_n_k_wos_lengths
{},
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
conv_output_g_n_k_wos_lengths
{},
conv_output_g_n_k_wos_strides
{};
conv_output_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
2
>
r0_lengths
{},
r0_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
2
>
r0_lengths
{},
r0_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{},
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{},
input_right_pads
{};
unpack_host_tensor_descriptor
(
unpack_host_tensor_descriptor
(
conv_input_g_n_c_wis_desc
,
conv_input_g_n_c_wis_lengths
,
conv_input_g_n_c_wis_strides
);
conv_input_g_n_c_wis_desc
,
conv_input_g_n_c_wis_lengths
,
conv_input_g_n_c_wis_strides
);
...
@@ -123,33 +117,30 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
...
@@ -123,33 +117,30 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
conv_output_g_n_k_wos_desc
,
conv_output_g_n_k_wos_lengths
,
conv_output_g_n_k_wos_strides
);
conv_output_g_n_k_wos_desc
,
conv_output_g_n_k_wos_lengths
,
conv_output_g_n_k_wos_strides
);
unpack_host_tensor_descriptor
(
r0_desc
,
r0_lengths
,
r0_strides
);
unpack_host_tensor_descriptor
(
r0_desc
,
r0_lengths
,
r0_strides
);
copy
(
problem_size
.
conv_filter_strides_
,
begin
(
conv_filter_strides
));
using
ck
::
utils
::
empty_array
,
ck
::
utils
::
to_array
;
copy
(
problem_size
.
conv_filter_dilations_
,
begin
(
conv_filter_dilations
));
copy
(
problem_size
.
input_left_pads_
,
begin
(
input_left_pads
));
copy
(
problem_size
.
input_right_pads_
,
begin
(
input_right_pads
));
// run Conv + Reduction on device
// run Conv + Reduction on device
auto
conv
=
DeviceInstance
<
NDimSpatial
>
{};
auto
conv
=
DeviceInstance
<
NDimSpatial
>
{};
auto
invoker
=
conv
.
MakeInvoker
();
auto
invoker
=
conv
.
MakeInvoker
();
auto
argument
=
conv
.
MakeArgument
(
conv_input_device_buf
.
GetDeviceBuffer
(),
auto
argument
=
conv
.
MakeArgument
(
conv_input_device_buf
.
GetDeviceBuffer
(),
conv_weight_device_buf
.
GetDeviceBuffer
(),
conv_weight_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
0
>
{}
,
empty_array
()
,
conv_output_device_buf
.
GetDeviceBuffer
(),
conv_output_device_buf
.
GetDeviceBuffer
(),
{
r0_device_buf
.
GetDeviceBuffer
()},
{
r0_device_buf
.
GetDeviceBuffer
()},
conv_input_g_n_c_wis_lengths
,
conv_input_g_n_c_wis_lengths
,
conv_input_g_n_c_wis_strides
,
conv_input_g_n_c_wis_strides
,
conv_weight_g_k_c_xs_lengths
,
conv_weight_g_k_c_xs_lengths
,
conv_weight_g_k_c_xs_strides
,
conv_weight_g_k_c_xs_strides
,
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
0
>
{{}}
,
empty_array
()
,
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
0
>
{{}}
,
empty_array
()
,
conv_output_g_n_k_wos_lengths
,
conv_output_g_n_k_wos_lengths
,
conv_output_g_n_k_wos_strides
,
conv_output_g_n_k_wos_strides
,
r0_lengths
,
r0_lengths
,
r0_strides
,
r0_strides
,
conv_filter_strides
,
to_array
(
problem_size
.
conv_filter_strides
_
)
,
conv_filter_dilations
,
to_array
(
problem_size
.
conv_filter_dilations
_
)
,
input_left_pads
,
to_array
(
problem_size
.
input_left_pads
_
)
,
input_right_pads
,
to_array
(
problem_size
.
input_right_pads
_
)
,
AElementOp
{},
AElementOp
{},
BElementOp
{},
BElementOp
{},
CDEElementOp
{},
CDEElementOp
{},
...
@@ -194,11 +185,11 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
...
@@ -194,11 +185,11 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
Tensor
<
R0DataType
>
r0_host
(
r0_device
.
m
Desc
);
Tensor
<
R0DataType
>
r0_host
(
r0_device
.
Get
Desc
()
);
auto
reduce0_op
=
RsThreadReduceOp
{}[
ck
::
Number
<
0
>
{}];
auto
reduce0_op
=
RsThreadReduceOp
{}[
ck
::
Number
<
0
>
{}];
auto
&
output_dims
=
conv_output_g_n_k_wos_desc
.
GetLengths
();
auto
output_dims
=
conv_output_g_n_k_wos_desc
.
GetLengths
();
if
constexpr
(
NDimSpatial
==
1
)
if
constexpr
(
NDimSpatial
==
1
)
{
{
...
@@ -273,19 +264,16 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
...
@@ -273,19 +264,16 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
}
}
}
}
conv_output_device_buf
.
FromDevice
(
conv_output_device
.
mData
.
data
());
conv_output_device_buf
.
FromDevice
(
conv_output_device
.
data
());
r0_device_buf
.
FromDevice
(
r0_device
.
mData
.
data
());
r0_device_buf
.
FromDevice
(
r0_device
.
data
());
return
ck
::
utils
::
check_err
(
conv_output_device
.
mData
,
return
ck
::
utils
::
check_err
(
conv_output_device
,
conv_output_host
.
mData
,
conv_output_host
,
"Error: incorrect results! (Matrix E)"
,
"Error: incorrect results! (Matrix E)"
,
1
e
-
5
f
,
1
e
-
5
f
,
1
e
-
4
f
)
&&
1
e
-
4
f
)
&&
ck
::
utils
::
check_err
(
r0_device
.
mData
,
ck
::
utils
::
check_err
(
r0_host
.
mData
,
r0_device
,
r0_host
,
"Error: incorrect results! (Matrix R0)"
,
1
e
-
5
f
,
1
e
-
4
f
);
"Error: incorrect results! (Matrix R0)"
,
1
e
-
5
f
,
1
e
-
4
f
);
}
}
return
true
;
return
true
;
...
...
example/12_reduce/reduce_blockwise_impl.hpp
View file @
e4e99a49
...
@@ -7,15 +7,17 @@
...
@@ -7,15 +7,17 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/ranges.hpp"
#include "reduce_example_common.hpp"
#include "reduce_example_common.hpp"
...
@@ -156,11 +158,11 @@ int reduce_blockwise_impl(bool do_verification,
...
@@ -156,11 +158,11 @@ int reduce_blockwise_impl(bool do_verification,
Tensor
<
int
>
out_indices_ref
(
outLengths
);
Tensor
<
int
>
out_indices_ref
(
outLengths
);
Tensor
<
int
>
out_indices
(
outLengths
);
Tensor
<
int
>
out_indices
(
outLengths
);
auto
inStrides
=
in
.
mDesc
.
GetStrides
();
auto
inStrides
=
in
.
GetStrides
();
auto
outStrides
=
out
.
mDesc
.
GetStrides
();
auto
outStrides
=
out
.
GetStrides
();
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
invariant_total_length
=
out
.
GetElementSize
();
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
size_t
reduce_total_length
=
in
.
GetElementSize
()
/
invariant_total_length
;
std
::
size_t
num_thread
=
1
;
std
::
size_t
num_thread
=
1
;
...
@@ -187,42 +189,43 @@ int reduce_blockwise_impl(bool do_verification,
...
@@ -187,42 +189,43 @@ int reduce_blockwise_impl(bool do_verification,
}
}
if
(
beta
!=
0.0
f
)
if
(
beta
!=
0.0
f
)
for
(
size_t
i
=
0
;
i
<
out_ref
.
mDesc
.
GetElementSpaceSize
();
i
++
)
{
out
.
mData
[
i
]
=
out_ref
.
mData
[
i
];
ck
::
ranges
::
copy
(
out_ref
,
out
.
begin
());
}
};
};
// these buffers are usually provided by the user application
// these buffers are usually provided by the user application
DeviceMem
in_dev
(
sizeof
(
InOutDataTypeInDevice
)
*
in
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
in_dev
(
in
.
GetMemory
Size
());
DeviceMem
out_dev
(
sizeof
(
InOutDataTypeInDevice
)
*
out
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
out_dev
(
out
.
GetMemory
Size
());
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if
(
std
::
is_same
<
InOutDataType
,
int4_t
>::
value
)
if
(
std
::
is_same
<
InOutDataType
,
int4_t
>::
value
)
{
{
std
::
vector
<
InOutDataTypeInDevice
>
tmp_buf
(
in
.
mData
.
size
());
std
::
vector
<
InOutDataTypeInDevice
>
tmp_buf
(
in
.
size
());
std
::
copy_n
(
in
.
mData
.
data
(),
in
.
mData
.
size
(),
tmp_buf
.
data
());
std
::
copy_n
(
in
.
data
(),
in
.
size
(),
tmp_buf
.
data
());
in_dev
.
ToDevice
(
tmp_buf
.
data
());
in_dev
.
ToDevice
(
tmp_buf
.
data
());
}
}
else
else
#endif
#endif
in_dev
.
ToDevice
(
in
.
mData
.
data
());
in_dev
.
ToDevice
(
in
.
data
());
if
(
beta
!=
0.0
f
)
if
(
beta
!=
0.0
f
)
{
{
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if
(
std
::
is_same
<
InOutDataType
,
int4_t
>::
value
)
if
(
std
::
is_same
<
InOutDataType
,
int4_t
>::
value
)
{
{
std
::
vector
<
InOutDataTypeInDevice
>
tmp_buf
(
in
.
mData
.
size
());
std
::
vector
<
InOutDataTypeInDevice
>
tmp_buf
(
in
.
size
());
std
::
copy_n
(
out
.
mData
.
data
(),
out
.
mData
.
size
(),
tmp_buf
.
data
());
std
::
copy_n
(
out
.
data
(),
out
.
size
(),
tmp_buf
.
data
());
out_dev
.
ToDevice
(
tmp_buf
.
data
());
out_dev
.
ToDevice
(
tmp_buf
.
data
());
}
}
else
else
#endif
#endif
out_dev
.
ToDevice
(
out
.
mData
.
data
());
out_dev
.
ToDevice
(
out
.
data
());
};
};
size_t
indicesSizeInBytes
=
OutputIndex
?
out
.
mDesc
.
GetElementSize
()
*
sizeof
(
int32_t
)
:
0
;
size_t
indicesSizeInBytes
=
OutputIndex
?
out
.
GetElementSize
()
*
sizeof
(
int32_t
)
:
0
;
DeviceMem
out_index_dev
(
indicesSizeInBytes
);
DeviceMem
out_index_dev
(
indicesSizeInBytes
);
...
@@ -245,33 +248,25 @@ int reduce_blockwise_impl(bool do_verification,
...
@@ -245,33 +248,25 @@ int reduce_blockwise_impl(bool do_verification,
NumReduceDim
,
NumReduceDim
,
PropagateNan
,
PropagateNan
,
OutputIndex
>
OutputIndex
>
hostReduce
(
in
.
m
Desc
,
out_ref
.
m
Desc
,
invariantDims
,
reduceDims
);
hostReduce
(
in
.
Get
Desc
()
,
out_ref
.
Get
Desc
()
,
invariantDims
,
reduceDims
);
hostReduce
.
Run
(
alpha
,
hostReduce
.
Run
(
alpha
,
in
.
mData
.
data
(),
in
.
data
(),
beta
,
beta
,
out_ref
.
mData
.
data
(),
out_ref
.
data
(),
out_indices_ref
.
mData
.
data
(),
out_indices_ref
.
data
(),
in_elementwise_op
,
in_elementwise_op
,
acc_elementwise_op
);
acc_elementwise_op
);
};
};
std
::
vector
<
ck
::
index_t
>
i_inLengths
;
using
Indices
=
std
::
vector
<
ck
::
index_t
>
;
std
::
vector
<
ck
::
index_t
>
i_inStrides
;
std
::
vector
<
ck
::
index_t
>
i_outLengths
;
std
::
vector
<
ck
::
index_t
>
i_outStrides
;
i_inLengths
.
assign
(
inLengths
.
begin
(),
inLengths
.
end
());
i_inStrides
.
assign
(
inStrides
.
begin
(),
inStrides
.
end
());
i_outLengths
.
assign
(
outLengths
.
begin
(),
outLengths
.
end
());
i_outStrides
.
assign
(
outStrides
.
begin
(),
outStrides
.
end
());
auto
reduce
=
DeviceReduceInstance
{};
auto
reduce
=
DeviceReduceInstance
{};
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
i_
inLengths
,
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
ck
::
ranges
::
to
<
Indices
>
(
inLengths
)
,
i_
inStrides
,
ck
::
ranges
::
to
<
Indices
>
(
inStrides
)
,
i_
outLengths
,
ck
::
ranges
::
to
<
Indices
>
(
outLengths
)
,
i_
outStrides
,
ck
::
ranges
::
to
<
Indices
>
(
outStrides
)
,
reduceDims
,
reduceDims
,
alpha
,
alpha
,
beta
,
beta
,
...
@@ -312,22 +307,22 @@ int reduce_blockwise_impl(bool do_verification,
...
@@ -312,22 +307,22 @@ int reduce_blockwise_impl(bool do_verification,
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if
(
std
::
is_same
<
InOutDataType
,
int4_t
>::
value
)
if
(
std
::
is_same
<
InOutDataType
,
int4_t
>::
value
)
{
{
std
::
vector
<
InOutDataTypeInDevice
>
tmp_buf
(
out
.
mData
.
size
());
std
::
vector
<
InOutDataTypeInDevice
>
tmp_buf
(
out
.
size
());
out_dev
.
FromDevice
(
tmp_buf
.
data
());
out_dev
.
FromDevice
(
tmp_buf
.
data
());
std
::
copy_n
(
tmp_buf
.
data
(),
out
.
mData
.
size
(),
out
.
mData
.
data
());
std
::
copy_n
(
tmp_buf
.
data
(),
out
.
size
(),
out
.
data
());
}
}
else
else
#endif
#endif
out_dev
.
FromDevice
(
out
.
mData
.
data
());
out_dev
.
FromDevice
(
out
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
,
out_ref
);
if
(
OutputIndex
)
if
(
OutputIndex
)
{
{
out_index_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
out_index_dev
.
FromDevice
(
out_indices
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_indices
,
out_indices_ref
);
};
};
};
};
...
...
example/12_reduce/reduce_blockwise_two_call.cpp
View file @
e4e99a49
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
#include <sstream>
#include <sstream>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/ranges.hpp"
using
namespace
ck
;
using
namespace
ck
;
using
namespace
ck
::
tensor_operation
::
device
;
using
namespace
ck
::
tensor_operation
::
device
;
...
@@ -139,12 +142,12 @@ int main(int argc, char* argv[])
...
@@ -139,12 +142,12 @@ int main(int argc, char* argv[])
Tensor
<
InOutDataType
>
in_2
(
inLengths_2
);
// also the output tensor of the first reduction
Tensor
<
InOutDataType
>
in_2
(
inLengths_2
);
// also the output tensor of the first reduction
Tensor
<
InOutDataType
>
out
(
outLengths
);
Tensor
<
InOutDataType
>
out
(
outLengths
);
auto
inStrides_1
=
in_1
.
mDesc
.
GetStrides
();
auto
inStrides_1
=
in_1
.
GetStrides
();
auto
inStrides_2
=
in_2
.
mDesc
.
GetStrides
();
auto
inStrides_2
=
in_2
.
GetStrides
();
auto
outStrides
=
out
.
mDesc
.
GetStrides
();
auto
outStrides
=
out
.
GetStrides
();
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
invariant_total_length
=
out
.
GetElementSize
();
size_t
reduce_total_length
=
in_1
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
size_t
reduce_total_length
=
in_1
.
GetElementSize
()
/
invariant_total_length
;
std
::
size_t
num_thread
=
1
;
std
::
size_t
num_thread
=
1
;
...
@@ -171,18 +174,19 @@ int main(int argc, char* argv[])
...
@@ -171,18 +174,19 @@ int main(int argc, char* argv[])
}
}
if
(
beta
!=
0.0
f
)
if
(
beta
!=
0.0
f
)
for
(
size_t
i
=
0
;
i
<
out_ref
.
mDesc
.
GetElementSpaceSize
();
i
++
)
{
out
.
mData
[
i
]
=
out_ref
.
mData
[
i
];
ck
::
ranges
::
copy
(
out_ref
,
out
.
begin
());
}
};
};
DeviceMem
in_1_dev
(
sizeof
(
InOutDataType
)
*
in_1
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
in_1_dev
(
in_1
.
GetMemory
Size
());
DeviceMem
in_2_dev
(
sizeof
(
InOutDataType
)
*
in_2
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
in_2_dev
(
in_2
.
GetMemory
Size
());
DeviceMem
out_dev
(
sizeof
(
InOutDataType
)
*
out
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
out_dev
(
out
.
GetMemory
Size
());
in_1_dev
.
ToDevice
(
in_1
.
mData
.
data
());
in_1_dev
.
ToDevice
(
in_1
.
data
());
if
(
beta
!=
0.0
f
)
if
(
beta
!=
0.0
f
)
out_dev
.
ToDevice
(
out
.
mData
.
data
());
out_dev
.
ToDevice
(
out
.
data
());
InElementwiseOperation
in_elementwise_op
;
InElementwiseOperation
in_elementwise_op
;
AccElementwiseOperation
acc_elementwise_op
;
AccElementwiseOperation
acc_elementwise_op
;
...
@@ -203,37 +207,25 @@ int main(int argc, char* argv[])
...
@@ -203,37 +207,25 @@ int main(int argc, char* argv[])
2
,
// NumReduceDim
2
,
// NumReduceDim
PropagateNan
,
PropagateNan
,
OutputIndex
>
OutputIndex
>
hostReduce
(
in_1
.
m
Desc
,
out_ref
.
m
Desc
,
invariantDims
,
reduceDims
);
hostReduce
(
in_1
.
Get
Desc
()
,
out_ref
.
Get
Desc
()
,
invariantDims
,
reduceDims
);
hostReduce
.
Run
(
alpha
,
hostReduce
.
Run
(
alpha
,
in_1
.
mData
.
data
(),
in_1
.
data
(),
beta
,
beta
,
out_ref
.
mData
.
data
(),
out_ref
.
data
(),
nullptr
,
nullptr
,
in_elementwise_op
,
in_elementwise_op
,
acc_elementwise_op
);
acc_elementwise_op
);
};
};
std
::
vector
<
ck
::
index_t
>
i_inLengths_1
;
using
Indices
=
std
::
vector
<
ck
::
index_t
>
;
std
::
vector
<
ck
::
index_t
>
i_inStrides_1
;
std
::
vector
<
ck
::
index_t
>
i_inLengths_2
;
std
::
vector
<
ck
::
index_t
>
i_inStrides_2
;
std
::
vector
<
ck
::
index_t
>
i_outLengths
;
std
::
vector
<
ck
::
index_t
>
i_outStrides
;
i_inLengths_1
.
assign
(
inLengths_1
.
begin
(),
inLengths_1
.
end
());
i_inStrides_1
.
assign
(
inStrides_1
.
begin
(),
inStrides_1
.
end
());
i_inLengths_2
.
assign
(
inLengths_2
.
begin
(),
inLengths_2
.
end
());
i_inStrides_2
.
assign
(
inStrides_2
.
begin
(),
inStrides_2
.
end
());
i_outLengths
.
assign
(
outLengths
.
begin
(),
outLengths
.
end
());
i_outStrides
.
assign
(
outStrides
.
begin
(),
outStrides
.
end
());
auto
reduce_1
=
DeviceReduceInstance_1
{};
auto
reduce_1
=
DeviceReduceInstance_1
{};
auto
argument_ptr_1
=
reduce_1
.
MakeArgumentPointer
(
i_
inLengths_1
,
auto
argument_ptr_1
=
reduce_1
.
MakeArgumentPointer
(
ck
::
ranges
::
to
<
Indices
>
(
inLengths_1
)
,
i_
inStrides_1
,
ck
::
ranges
::
to
<
Indices
>
(
inStrides_1
)
,
i_
inLengths_2
,
ck
::
ranges
::
to
<
Indices
>
(
inLengths_2
)
,
i_
inStrides_2
,
ck
::
ranges
::
to
<
Indices
>
(
inStrides_2
)
,
reduceDims_1
,
reduceDims_1
,
1.0
f
,
1.0
f
,
0.0
f
,
0.0
f
,
...
@@ -255,10 +247,10 @@ int main(int argc, char* argv[])
...
@@ -255,10 +247,10 @@ int main(int argc, char* argv[])
auto
reduce_2
=
DeviceReduceInstance_2
{};
auto
reduce_2
=
DeviceReduceInstance_2
{};
auto
argument_ptr_2
=
reduce_2
.
MakeArgumentPointer
(
i_
inLengths_2
,
auto
argument_ptr_2
=
reduce_2
.
MakeArgumentPointer
(
ck
::
ranges
::
to
<
Indices
>
(
inLengths_2
)
,
i_
inStrides_2
,
ck
::
ranges
::
to
<
Indices
>
(
inStrides_2
)
,
i_
outLengths
,
ck
::
ranges
::
to
<
Indices
>
(
outLengths
)
,
i_
outStrides
,
ck
::
ranges
::
to
<
Indices
>
(
outStrides
)
,
reduceDims_2
,
reduceDims_2
,
alpha
,
alpha
,
beta
,
beta
,
...
@@ -293,8 +285,8 @@ int main(int argc, char* argv[])
...
@@ -293,8 +285,8 @@ int main(int argc, char* argv[])
if
(
do_verify
)
if
(
do_verify
)
{
{
out_dev
.
FromDevice
(
out
.
mData
.
data
());
out_dev
.
FromDevice
(
out
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
,
out_ref
);
};
};
return
(
pass
?
0
:
1
);
return
(
pass
?
0
:
1
);
...
...
example/12_reduce/reduce_multiblock_atomic_add_impl.hpp
View file @
e4e99a49
...
@@ -7,15 +7,17 @@
...
@@ -7,15 +7,17 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/ranges.hpp"
#include "reduce_example_common.hpp"
#include "reduce_example_common.hpp"
...
@@ -95,11 +97,11 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
...
@@ -95,11 +97,11 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
Tensor
<
InOutDataType
>
out_ref
(
outLengths
);
Tensor
<
InOutDataType
>
out_ref
(
outLengths
);
Tensor
<
InOutDataType
>
out
(
outLengths
);
Tensor
<
InOutDataType
>
out
(
outLengths
);
auto
inStrides
=
in
.
mDesc
.
GetStrides
();
auto
inStrides
=
in
.
GetStrides
();
auto
outStrides
=
out
.
mDesc
.
GetStrides
();
auto
outStrides
=
out
.
GetStrides
();
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
invariant_total_length
=
out
.
GetElementSize
();
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
size_t
reduce_total_length
=
in
.
GetElementSize
()
/
invariant_total_length
;
std
::
size_t
num_thread
=
1
;
std
::
size_t
num_thread
=
1
;
...
@@ -126,18 +128,19 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
...
@@ -126,18 +128,19 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
}
}
if
(
beta
!=
0.0
f
)
if
(
beta
!=
0.0
f
)
for
(
size_t
i
=
0
;
i
<
out_ref
.
mDesc
.
GetElementSpaceSize
();
i
++
)
{
out
.
mData
[
i
]
=
out_ref
.
mData
[
i
];
ck
::
ranges
::
copy
(
out_ref
,
out
.
begin
());
}
};
};
// these buffers are usually provided by the user application
// these buffers are usually provided by the user application
DeviceMem
in_dev
(
sizeof
(
InOutDataType
)
*
in
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
in_dev
(
in
.
GetMemory
Size
());
DeviceMem
out_dev
(
sizeof
(
InOutDataType
)
*
out
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
out_dev
(
out
.
GetMemory
Size
());
in_dev
.
ToDevice
(
in
.
mData
.
data
());
in_dev
.
ToDevice
(
in
.
data
());
if
(
beta
!=
0.0
f
)
if
(
beta
!=
0.0
f
)
out_dev
.
ToDevice
(
out
.
mData
.
data
());
out_dev
.
ToDevice
(
out
.
data
());
InElementwiseOperation
in_elementwise_op
;
InElementwiseOperation
in_elementwise_op
;
AccElementwiseOperation
acc_elementwise_op
;
AccElementwiseOperation
acc_elementwise_op
;
...
@@ -158,33 +161,20 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
...
@@ -158,33 +161,20 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
NumReduceDim
,
NumReduceDim
,
PropagateNan
,
PropagateNan
,
false
>
false
>
hostReduce
(
in
.
m
Desc
,
out_ref
.
m
Desc
,
invariantDims
,
reduceDims
);
hostReduce
(
in
.
Get
Desc
()
,
out_ref
.
Get
Desc
()
,
invariantDims
,
reduceDims
);
hostReduce
.
Run
(
alpha
,
hostReduce
.
Run
(
in
.
mData
.
data
(),
alpha
,
in
.
data
(),
beta
,
out_ref
.
data
(),
nullptr
,
in_elementwise_op
,
acc_elementwise_op
);
beta
,
out_ref
.
mData
.
data
(),
nullptr
,
in_elementwise_op
,
acc_elementwise_op
);
};
};
std
::
vector
<
ck
::
index_t
>
i_inLengths
;
using
Indices
=
std
::
vector
<
ck
::
index_t
>
;
std
::
vector
<
ck
::
index_t
>
i_inStrides
;
std
::
vector
<
ck
::
index_t
>
i_outLengths
;
std
::
vector
<
ck
::
index_t
>
i_outStrides
;
i_inLengths
.
assign
(
inLengths
.
begin
(),
inLengths
.
end
());
i_inStrides
.
assign
(
inStrides
.
begin
(),
inStrides
.
end
());
i_outLengths
.
assign
(
outLengths
.
begin
(),
outLengths
.
end
());
i_outStrides
.
assign
(
outStrides
.
begin
(),
outStrides
.
end
());
auto
reduce
=
DeviceReduceInstance
{};
auto
reduce
=
DeviceReduceInstance
{};
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
i_
inLengths
,
auto
argument_ptr
=
reduce
.
MakeArgumentPointer
(
ck
::
ranges
::
to
<
Indices
>
(
inLengths
)
,
i_
inStrides
,
ck
::
ranges
::
to
<
Indices
>
(
inStrides
)
,
i_
outLengths
,
ck
::
ranges
::
to
<
Indices
>
(
outLengths
)
,
i_
outStrides
,
ck
::
ranges
::
to
<
Indices
>
(
outStrides
)
,
reduceDims
,
reduceDims
,
alpha
,
alpha
,
beta
,
beta
,
...
@@ -222,8 +212,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
...
@@ -222,8 +212,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
if
(
do_verification
)
if
(
do_verification
)
{
{
out_dev
.
FromDevice
(
out
.
mData
.
data
());
out_dev
.
FromDevice
(
out
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
out
,
out_ref
);
};
};
return
(
pass
?
0
:
1
);
return
(
pass
?
0
:
1
);
...
...
example/13_pool2d_fwd/pool2d_fwd_common.hpp
View file @
e4e99a49
...
@@ -8,14 +8,16 @@
...
@@ -8,14 +8,16 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
template
<
typename
InDataType
,
template
<
typename
InDataType
,
typename
OutDataType
,
typename
OutDataType
,
...
@@ -56,8 +58,8 @@ static void pool_host_verify(const Tensor<InDataType>& in,
...
@@ -56,8 +58,8 @@ static void pool_host_verify(const Tensor<InDataType>& in,
for
(
ck
::
index_t
x
=
0
;
x
<
window_spatial_lengths
[
1
];
++
x
)
for
(
ck
::
index_t
x
=
0
;
x
<
window_spatial_lengths
[
1
];
++
x
)
{
{
ck
::
index_t
wi
=
wo
*
window_strides
[
1
]
+
x
-
in_left_pads
[
1
];
ck
::
index_t
wi
=
wo
*
window_strides
[
1
]
+
x
-
in_left_pads
[
1
];
if
(
hi
>=
0
&&
hi
<
static_cast
<
ck
::
index_t
>
(
in
.
mDesc
.
GetLengths
()[
2
])
&&
if
(
hi
>=
0
&&
hi
<
static_cast
<
ck
::
index_t
>
(
in
.
GetLengths
()[
2
])
&&
wi
>=
0
&&
wi
>=
0
&&
wi
<
static_cast
<
ck
::
index_t
>
(
in
.
mDesc
.
GetLengths
()[
3
]))
wi
<
static_cast
<
ck
::
index_t
>
(
in
.
GetLengths
()[
3
]))
{
{
AccDataType
currVal
=
static_cast
<
AccDataType
>
(
in
(
n
,
c
,
hi
,
wi
));
AccDataType
currVal
=
static_cast
<
AccDataType
>
(
in
(
n
,
c
,
hi
,
wi
));
...
@@ -74,10 +76,10 @@ static void pool_host_verify(const Tensor<InDataType>& in,
...
@@ -74,10 +76,10 @@ static void pool_host_verify(const Tensor<InDataType>& in,
};
};
make_ParallelTensorFunctor
(
f_nchw
,
make_ParallelTensorFunctor
(
f_nchw
,
out
.
mDesc
.
GetLengths
()[
0
],
out
.
GetLengths
()[
0
],
out
.
mDesc
.
GetLengths
()[
1
],
out
.
GetLengths
()[
1
],
out
.
mDesc
.
GetLengths
()[
2
],
out
.
GetLengths
()[
2
],
out
.
mDesc
.
GetLengths
()[
3
])(
std
::
thread
::
hardware_concurrency
());
out
.
GetLengths
()[
3
])(
std
::
thread
::
hardware_concurrency
());
}
}
else
else
{
{
...
@@ -95,8 +97,7 @@ static void pool_host_verify(const Tensor<InDataType>& in,
...
@@ -95,8 +97,7 @@ static void pool_host_verify(const Tensor<InDataType>& in,
for
(
ck
::
index_t
x
=
0
;
x
<
window_spatial_lengths
[
1
];
++
x
)
for
(
ck
::
index_t
x
=
0
;
x
<
window_spatial_lengths
[
1
];
++
x
)
{
{
ck
::
index_t
wi
=
wo
*
window_strides
[
1
]
+
x
-
in_left_pads
[
1
];
ck
::
index_t
wi
=
wo
*
window_strides
[
1
]
+
x
-
in_left_pads
[
1
];
if
(
hi
>=
0
&&
hi
<
in
.
mDesc
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
if
(
hi
>=
0
&&
hi
<
in
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
wi
<
in
.
GetLengths
()[
3
])
wi
<
in
.
mDesc
.
GetLengths
()[
3
])
{
{
AccDataType
currVal
=
static_cast
<
AccDataType
>
(
in
(
n
,
c
,
hi
,
wi
));
AccDataType
currVal
=
static_cast
<
AccDataType
>
(
in
(
n
,
c
,
hi
,
wi
));
IndexDataType
currIndex
=
y
*
window_spatial_lengths
[
1
]
+
x
;
IndexDataType
currIndex
=
y
*
window_spatial_lengths
[
1
]
+
x
;
...
@@ -115,10 +116,10 @@ static void pool_host_verify(const Tensor<InDataType>& in,
...
@@ -115,10 +116,10 @@ static void pool_host_verify(const Tensor<InDataType>& in,
};
};
make_ParallelTensorFunctor
(
f_nchw
,
make_ParallelTensorFunctor
(
f_nchw
,
out
.
mDesc
.
GetLengths
()[
0
],
out
.
GetLengths
()[
0
],
out
.
mDesc
.
GetLengths
()[
1
],
out
.
GetLengths
()[
1
],
out
.
mDesc
.
GetLengths
()[
2
],
out
.
GetLengths
()[
2
],
out
.
mDesc
.
GetLengths
()[
3
])(
std
::
thread
::
hardware_concurrency
());
out
.
GetLengths
()[
3
])(
std
::
thread
::
hardware_concurrency
());
};
};
}
}
...
@@ -169,19 +170,18 @@ bool pool_test(bool do_verification,
...
@@ -169,19 +170,18 @@ bool pool_test(bool do_verification,
const
std
::
array
<
ck
::
index_t
,
2
>
input_left_pads
{{
in_left_pad_h
,
in_left_pad_w
}};
const
std
::
array
<
ck
::
index_t
,
2
>
input_left_pads
{{
in_left_pad_h
,
in_left_pad_w
}};
const
std
::
array
<
ck
::
index_t
,
2
>
input_right_pads
{{
in_right_pad_h
,
in_right_pad_w
}};
const
std
::
array
<
ck
::
index_t
,
2
>
input_right_pads
{{
in_right_pad_h
,
in_right_pad_w
}};
using
namespace
ck
::
literals
;
// tensor layout
// tensor layout
auto
f_host_tensor_descriptor
=
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
H
,
std
::
size_t
W
,
auto
layout
)
{
[](
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
H
,
std
::
size_t
W
,
auto
layout
)
{
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NCHW
>
::
value
)
if
constexpr
(
ck
::
is_same
_v
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NCHW
>
)
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
N_
,
C_
,
H
,
W
}),
return
HostTensorDescriptor
({
N_
,
C_
,
H
,
W
},
{
C_
*
H
*
W
,
H
*
W
,
W
,
1
_uz
});
std
::
vector
<
std
::
size_t
>
({
C_
*
H
*
W
,
H
*
W
,
W
,
1
}));
}
}
else
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
else
if
constexpr
(
ck
::
is_same_v
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NHWC
>
)
ck
::
tensor_layout
::
convolution
::
NHWC
>::
value
)
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
N_
,
C_
,
H
,
W
}),
return
HostTensorDescriptor
({
N_
,
C_
,
H
,
W
},
{
C_
*
H
*
W
,
1
_uz
,
W
*
C_
,
C_
});
std
::
vector
<
std
::
size_t
>
({
C_
*
H
*
W
,
1
,
W
*
C_
,
C_
}));
}
}
};
};
...
@@ -193,8 +193,8 @@ bool pool_test(bool do_verification,
...
@@ -193,8 +193,8 @@ bool pool_test(bool do_verification,
Tensor
<
IndexDataType
>
out_indices_n_c_ho_wo_device
(
Tensor
<
IndexDataType
>
out_indices_n_c_ho_wo_device
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
,
OutLayout
{}));
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
,
OutLayout
{}));
std
::
cout
<<
"in_n_c_hi_wi: "
<<
in_n_c_hi_wi
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"in_n_c_hi_wi: "
<<
in_n_c_hi_wi
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"out_n_c_ho_wo: "
<<
out_n_c_ho_wo_host
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"out_n_c_ho_wo: "
<<
out_n_c_ho_wo_host
.
Get
Desc
()
<<
std
::
endl
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
@@ -204,25 +204,24 @@ bool pool_test(bool do_verification,
...
@@ -204,25 +204,24 @@ bool pool_test(bool do_verification,
default:
in_n_c_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
5.0
,
5.0
});
default:
in_n_c_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
5.0
,
5.0
});
}
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
in_device_buf
(
in_n_c_hi_wi
.
GetMemory
Size
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
DeviceMem
out_device_buf
(
out_n_c_ho_wo_device
.
GetMemorySize
());
out_n_c_ho_wo_device
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
out_indices_device_buf
(
out_indices_n_c_ho_wo_device
.
GetMemory
Size
());
DeviceMem
out_indices_device_buf
(
sizeof
(
IndexDataType
)
*
out_indices
_n_c_h
o
_w
o_device
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
in
_n_c_h
i
_w
i
.
data
());
in_device_buf
.
ToDevice
(
in_n_c_hi_wi
.
mData
.
data
())
;
using
ck
::
utils
::
to_array
;
auto
pool
=
DevicePoolFwdInstance
{};
auto
pool
=
DevicePoolFwdInstance
{};
auto
invoker_ptr
=
pool
.
MakeInvokerPointer
();
auto
invoker_ptr
=
pool
.
MakeInvokerPointer
();
auto
argument_ptr
=
pool
.
MakeArgumentPointer
(
auto
argument_ptr
=
pool
.
MakeArgumentPointer
(
in_device_buf
.
GetDeviceBuffer
(),
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
out_device_buf
.
GetDeviceBuffer
(),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
out_indices_device_buf
.
GetDeviceBuffer
(),
static_cast
<
IndexDataType
*>
(
out_indices_device_buf
.
GetDeviceBuffer
()),
N
,
N
,
C
,
C
,
std
::
array
<
ck
::
index_t
,
2
>
{
{
Hi
,
Wi
}
}
,
to_array
(
{
Hi
,
Wi
}
)
,
std
::
array
<
ck
::
index_t
,
2
>
{
{
Y
,
X
}
}
,
to_array
(
{
Y
,
X
}
)
,
std
::
array
<
ck
::
index_t
,
2
>
{
{
Ho
,
Wo
}
}
,
to_array
(
{
Ho
,
Wo
}
)
,
window_strides
,
window_strides
,
input_left_pads
,
input_left_pads
,
input_right_pads
);
input_right_pads
);
...
@@ -265,16 +264,16 @@ bool pool_test(bool do_verification,
...
@@ -265,16 +264,16 @@ bool pool_test(bool do_verification,
input_left_pads
,
input_left_pads
,
input_right_pads
);
input_right_pads
);
out_device_buf
.
FromDevice
(
out_n_c_ho_wo_device
.
mData
.
data
());
out_device_buf
.
FromDevice
(
out_n_c_ho_wo_device
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_n_c_ho_wo_device
.
mData
,
out_n_c_ho_wo_host
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_n_c_ho_wo_device
,
out_n_c_ho_wo_host
);
if
constexpr
(
OutputIndex
)
if
constexpr
(
OutputIndex
)
{
{
out_indices_device_buf
.
FromDevice
(
out_indices_n_c_ho_wo_device
.
mData
.
data
());
out_indices_device_buf
.
FromDevice
(
out_indices_n_c_ho_wo_device
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_indices_n_c_ho_wo_device
.
mData
,
pass
=
pass
&&
out_indices_n_c_ho_wo_host
.
mData
);
ck
::
utils
::
check_err
(
out_indices_n_c_ho_wo_device
,
out_indices_n_c_ho_wo_host
);
};
};
}
}
...
...
example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
View file @
e4e99a49
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
#include <initializer_list>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
...
@@ -12,11 +12,12 @@
...
@@ -12,11 +12,12 @@
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/check_err.hpp"
struct
RequantReluRequant
struct
RequantReluRequant
{
{
...
@@ -155,17 +156,17 @@ int main(int argc, char* argv[])
...
@@ -155,17 +156,17 @@ int main(int argc, char* argv[])
exit
(
0
);
exit
(
0
);
}
}
using
namespace
ck
::
literals
;
auto
f_host_tensor_descriptor
=
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
::
value
)
if
constexpr
(
std
::
is_same
_v
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
)
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
std
::
vector
<
std
::
size_t
>
({
stride
,
1
}));
}
}
else
else
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
std
::
vector
<
std
::
size_t
>
({
1
,
stride
}));
}
}
};
};
...
@@ -174,9 +175,9 @@ int main(int argc, char* argv[])
...
@@ -174,9 +175,9 @@ int main(int argc, char* argv[])
Tensor
<
CDataType
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
CDataType
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
CDataType
>
c_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
CDataType
>
c_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
Get
Desc
()
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
m
Desc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
Get
Desc
()
<<
std
::
endl
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
@@ -190,12 +191,12 @@ int main(int argc, char* argv[])
...
@@ -190,12 +191,12 @@ int main(int argc, char* argv[])
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
}
}
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
a_m_k_device_buf
(
a_m_k
.
GetMemory
Size
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
b_k_n_device_buf
(
b_k_n
.
GetMemory
Size
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpace
Size
());
DeviceMem
c_m_n_device_buf
(
c_m_n_device_result
.
GetMemory
Size
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
data
());
auto
a_element_op
=
PassThrough
{};
auto
a_element_op
=
PassThrough
{};
auto
b_element_op
=
PassThrough
{};
auto
b_element_op
=
PassThrough
{};
...
@@ -204,9 +205,9 @@ int main(int argc, char* argv[])
...
@@ -204,9 +205,9 @@ int main(int argc, char* argv[])
// do GEMM
// do GEMM
auto
gemm
=
DeviceGemmInstance
{};
auto
gemm
=
DeviceGemmInstance
{};
auto
invoker
=
gemm
.
MakeInvoker
();
auto
invoker
=
gemm
.
MakeInvoker
();
auto
argument
=
gemm
.
MakeArgument
(
static_cast
<
ADataType
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()
)
,
auto
argument
=
gemm
.
MakeArgument
(
a_m_k_device_buf
.
GetDeviceBuffer
(),
static_cast
<
BDataType
*>
(
b_k_n_device_buf
.
GetDeviceBuffer
()
)
,
b_k_n_device_buf
.
GetDeviceBuffer
(),
static_cast
<
CDataType
*>
(
c_m_n_device_buf
.
GetDeviceBuffer
()
)
,
c_m_n_device_buf
.
GetDeviceBuffer
(),
M
,
M
,
N
,
N
,
K
,
K
,
...
@@ -237,7 +238,7 @@ int main(int argc, char* argv[])
...
@@ -237,7 +238,7 @@ int main(int argc, char* argv[])
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
data
());
if
(
do_verification
)
if
(
do_verification
)
{
{
...
@@ -249,7 +250,7 @@ int main(int argc, char* argv[])
...
@@ -249,7 +250,7 @@ int main(int argc, char* argv[])
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
return
ck
::
utils
::
check_err
(
c_m_n_device_result
.
mData
,
c_m_n_host_result
.
mData
)
?
0
:
1
;
return
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
)
?
0
:
1
;
}
}
return
0
;
return
0
;
...
...
example/15_grouped_gemm/common.hpp
0 → 100644
View file @
e4e99a49
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
BF16
=
ck
::
bhalf_t
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp
View file @
e4e99a49
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "common.hpp"
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
BF16
=
ck
::
bhalf_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ADataType
=
BF16
;
using
ADataType
=
BF16
;
using
BDataType
=
BF16
;
using
BDataType
=
BF16
;
...
...
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
View file @
e4e99a49
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "common.hpp"
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ADataType
=
F16
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
BDataType
=
F16
;
...
...
example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp
View file @
e4e99a49
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "common.hpp"
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ADataType
=
F32
;
using
ADataType
=
F32
;
using
BDataType
=
F32
;
using
BDataType
=
F32
;
...
...
example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp
View file @
e4e99a49
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "common.hpp"
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ADataType
=
ck
::
int4_t
;
using
ADataType
=
ck
::
int4_t
;
using
BDataType
=
ck
::
int4_t
;
using
BDataType
=
ck
::
int4_t
;
...
...
example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp
View file @
e4e99a49
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "common.hpp"
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ADataType
=
int8_t
;
using
ADataType
=
int8_t
;
using
BDataType
=
int8_t
;
using
BDataType
=
int8_t
;
...
...
example/15_grouped_gemm/run_grouped_gemm_example.inc
View file @
e4e99a49
...
@@ -50,17 +50,17 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
...
@@ -50,17 +50,17 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
gemm_descs
.
push_back
({
M
,
N
,
K
,
stride_A
,
stride_B
,
stride_C
,
{}});
gemm_descs
.
push_back
({
M
,
N
,
K
,
stride_A
,
stride_B
,
stride_C
,
{}});
}
}
using
namespace
ck
::
literals
;
auto
f_host_tensor_descriptor
=
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
::
value
)
if
constexpr
(
std
::
is_same
_v
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
)
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1_
uz
});
std
::
vector
<
std
::
size_t
>
({
stride
,
1
}));
}
}
else
else
{
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
return
HostTensorDescriptor
({
row
,
col
},
{
1_
uz
,
stride
});
std
::
vector
<
std
::
size_t
>
({
1
,
stride
}));
}
}
};
};
...
@@ -90,27 +90,27 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
...
@@ -90,27 +90,27 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
{
{
a_tensors
.
push_back
(
Tensor
<
ADataType
>
(
f_host_tensor_descriptor
(
a_tensors
.
emplace_back
(
f_host_tensor_descriptor
(
gemm_descs
[
i
]
.
M_
,
gemm_descs
[
i
]
.
K_
,
gemm_descs
[
i
]
.
stride_A_
,
ALayout
{}))
)
;
gemm_descs
[
i
]
.
M_
,
gemm_descs
[
i
]
.
K_
,
gemm_descs
[
i
]
.
stride_A_
,
ALayout
{}));
b_tensors
.
push_back
(
Tensor
<
BDataType
>
(
f_host_tensor_descriptor
(
b_tensors
.
emplace_back
(
f_host_tensor_descriptor
(
gemm_descs
[
i
]
.
K_
,
gemm_descs
[
i
]
.
N_
,
gemm_descs
[
i
]
.
stride_B_
,
BLayout
{}))
)
;
gemm_descs
[
i
]
.
K_
,
gemm_descs
[
i
]
.
N_
,
gemm_descs
[
i
]
.
stride_B_
,
BLayout
{}));
c_host_tensors
.
push_back
(
Tensor
<
EDataType
>
(
f_host_tensor_descriptor
(
c_host_tensors
.
emplace_back
(
f_host_tensor_descriptor
(
gemm_descs
[
i
]
.
M_
,
gemm_descs
[
i
]
.
N_
,
gemm_descs
[
i
]
.
stride_C_
,
ELayout
{}))
)
;
gemm_descs
[
i
]
.
M_
,
gemm_descs
[
i
]
.
N_
,
gemm_descs
[
i
]
.
stride_C_
,
ELayout
{}));
#ifdef BUILD_INT4_EXAMPLE
#ifdef BUILD_INT4_EXAMPLE
c_device_tensors
.
push_back
(
Tensor
<
KernelEDataType
>
(
f_host_tensor_descriptor
(
c_device_tensors
.
emplace_back
(
f_host_tensor_descriptor
(
gemm_descs
[
i
]
.
M_
,
gemm_descs
[
i
]
.
N_
,
gemm_descs
[
i
]
.
stride_C_
,
ELayout
{}))
)
;
gemm_descs
[
i
]
.
M_
,
gemm_descs
[
i
]
.
N_
,
gemm_descs
[
i
]
.
stride_C_
,
ELayout
{}));
#else
#else
c_device_tensors
.
push_back
(
Tensor
<
EDataType
>
(
f_host_tensor_descriptor
(
c_device_tensors
.
emplace_back
(
f_host_tensor_descriptor
(
gemm_descs
[
i
]
.
M_
,
gemm_descs
[
i
]
.
N_
,
gemm_descs
[
i
]
.
stride_C_
,
ELayout
{}))
)
;
gemm_descs
[
i
]
.
M_
,
gemm_descs
[
i
]
.
N_
,
gemm_descs
[
i
]
.
stride_C_
,
ELayout
{}));
#endif
#endif
std
::
cout
<<
"gemm["
<<
i
<<
"] a_m_k: "
<<
a_tensors
[
i
]
.
m
Desc
std
::
cout
<<
"gemm["
<<
i
<<
"] a_m_k: "
<<
a_tensors
[
i
]
.
Get
Desc
()
<<
" b_k_n: "
<<
b_tensors
[
i
]
.
mDesc
<<
" c_m_n: "
<<
c_device_tensors
[
i
]
.
m
Desc
<<
" b_k_n: "
<<
b_tensors
[
i
]
.
Get
Desc
()
<<
std
::
endl
;
<<
" c_m_n: "
<<
c_device_tensors
[
i
]
.
GetDesc
()
<<
std
::
endl
;
flop
+=
std
::
size_t
(
2
)
*
gemm_descs
[
i
]
.
M_
*
gemm_descs
[
i
]
.
K_
*
gemm_descs
[
i
]
.
N_
;
flop
+=
std
::
size_t
(
2
)
*
gemm_descs
[
i
]
.
M_
*
gemm_descs
[
i
]
.
K_
*
gemm_descs
[
i
]
.
N_
;
num_btype
+=
sizeof
(
ADataType
)
*
a_tensors
[
i
]
.
mDesc
.
GetElementSize
()
+
num_btype
+=
sizeof
(
ADataType
)
*
a_tensors
[
i
]
.
GetElementSize
()
+
sizeof
(
BDataType
)
*
b_tensors
[
i
]
.
mDesc
.
GetElementSize
()
+
sizeof
(
BDataType
)
*
b_tensors
[
i
]
.
GetElementSize
()
+
sizeof
(
EDataType
)
*
c_device_tensors
[
i
]
.
mDesc
.
GetElementSize
();
sizeof
(
EDataType
)
*
c_device_tensors
[
i
]
.
GetElementSize
();
switch
(
config
.
init_method
)
switch
(
config
.
init_method
)
{
{
...
@@ -131,22 +131,20 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
...
@@ -131,22 +131,20 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
{
{
a_tensors_device
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
a_tensors_device
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
a_tensors
[
i
]
.
GetMemorySize
()));
sizeof
(
ADataType
)
*
a_tensors
[
i
]
.
mDesc
.
GetElementSpaceSize
()));
b_tensors_device
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
b_tensors
[
i
]
.
GetMemorySize
()));
b_tensors_device
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
c_tensors_device
.
emplace_back
(
sizeof
(
BDataType
)
*
b_tensors
[
i
]
.
mDesc
.
GetElementSpaceSize
()));
std
::
make_unique
<
DeviceMem
>
(
c_device_tensors
[
i
]
.
GetMemorySize
()));
c_tensors_device
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
EDataType
)
*
c_device_tensors
[
i
]
.
mDesc
.
GetElementSpaceSize
()));
#ifdef BUILD_INT4_EXAMPLE
#ifdef BUILD_INT4_EXAMPLE
const
Tensor
<
KernelADataType
>
a_converted
(
a_tensors
[
i
]);
const
Tensor
<
KernelADataType
>
a_converted
(
a_tensors
[
i
]);
const
Tensor
<
KernelBDataType
>
b_converted
(
b_tensors
[
i
]);
const
Tensor
<
KernelBDataType
>
b_converted
(
b_tensors
[
i
]);
a_tensors_device
[
i
]
->
ToDevice
(
a_converted
.
mData
.
data
());
a_tensors_device
[
i
]
->
ToDevice
(
a_converted
.
data
());
b_tensors_device
[
i
]
->
ToDevice
(
b_converted
.
mData
.
data
());
b_tensors_device
[
i
]
->
ToDevice
(
b_converted
.
data
());
#else
#else
a_tensors_device
[
i
]
->
ToDevice
(
a_tensors
[
i
]
.
mData
.
data
());
a_tensors_device
[
i
]
->
ToDevice
(
a_tensors
[
i
]
.
data
());
b_tensors_device
[
i
]
->
ToDevice
(
b_tensors
[
i
]
.
mData
.
data
());
b_tensors_device
[
i
]
->
ToDevice
(
b_tensors
[
i
]
.
data
());
#endif
#endif
p_a
.
push_back
(
a_tensors_device
[
i
]
->
GetDeviceBuffer
());
p_a
.
push_back
(
a_tensors_device
[
i
]
->
GetDeviceBuffer
());
...
@@ -193,7 +191,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
...
@@ -193,7 +191,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
{
{
c_tensors_device
[
i
]
->
FromDevice
(
c_device_tensors
[
i
]
.
mData
.
data
());
c_tensors_device
[
i
]
->
FromDevice
(
c_device_tensors
[
i
]
.
data
());
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
...
@@ -208,10 +206,10 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
...
@@ -208,10 +206,10 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
#ifdef BUILD_INT4_EXAMPLE
#ifdef BUILD_INT4_EXAMPLE
const
Tensor
<
EDataType
>
c_device_result_converted
(
c_device_tensors
[
i
]);
const
Tensor
<
EDataType
>
c_device_result_converted
(
c_device_tensors
[
i
]);
pass
&=
ck
::
utils
::
check_err
(
c_device_result_converted
.
mData
,
c_host_tensors
[
i
]
.
mData
);
pass
&=
ck
::
utils
::
check_err
(
c_device_result_converted
,
c_host_tensors
[
i
]);
#else
#else
pass
&=
ck
::
utils
::
check_err
(
c_device_tensors
[
i
]
.
mData
,
c_host_tensors
[
i
]
.
mData
);
pass
&=
ck
::
utils
::
check_err
(
c_device_tensors
[
i
],
c_host_tensors
[
i
]);
#endif
#endif
}
}
}
}
...
...
Prev
1
2
3
4
5
…
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment