gaoqiong / composable_kernel · Commits

Commit dd6a8de4, authored Apr 06, 2022 by Jehandad Khan
Merge branch 'develop' into jd/dev_pkg
Parents: 0aa899aa, abf4bdb9
Changes: 470
Showing 20 changed files with 1868 additions and 930 deletions (+1868 -930)
test/conv2d_fwd/CMakeLists.txt               +0    -3
test/conv2d_fwd/conv2d_fwd.cpp               +0    -308
test/conv_util/conv_util.cpp                 +115  -68
test/convnd_bwd_data/CMakeLists.txt          +8    -0
test/convnd_bwd_data/convnd_bwd_data.cpp     +330  -0
test/convnd_fwd/CMakeLists.txt               +17   -2
test/convnd_fwd/conv1d_fwd.cpp               +156  -0
test/convnd_fwd/conv2d_fwd.cpp               +150  -0
test/convnd_fwd/conv3d_fwd.cpp               +300  -0
test/convnd_fwd/conv_util.hpp                +90   -0
test/convnd_fwd/convnd_fwd.cpp               +0    -262
test/gemm/CMakeLists.txt                     +4    -0
test/gemm/gemm_bf16.cpp                      +62   -109
test/gemm/gemm_fp16.cpp                      +155  -0
test/gemm/gemm_fp32.cpp                      +103  -87
test/gemm/gemm_int8.cpp                      +81   -86
test/gemm/gemm_util.hpp                      +237  -0
test/gemm_reduce/CMakeLists.txt              +9    -0
test/gemm_reduce/gemm_reduce_fp16.cpp        +46   -0
test/gemm_split_k/gemm_split_k.cpp           +5    -5
test/conv2d_fwd/CMakeLists.txt (deleted, 100644 → 0)

add_test_executable(test_conv2d_fwd conv2d_fwd.cpp)
target_link_libraries(test_conv2d_fwd PRIVATE host_tensor)
target_link_libraries(test_conv2d_fwd PRIVATE device_conv2d_fwd_instance)
test/conv2d_fwd/conv2d_fwd.cpp (deleted, 100644 → 0)

#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_conv.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_conv_fwd.hpp"
#include "element_wise_operation.hpp"
#include "reference_conv_fwd.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv2d_fwd_instance {

using DeviceConvFwdNoOpPtr =
    DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
                     ck::tensor_operation::element_wise::PassThrough,
                     ck::tensor_operation::element_wise::PassThrough>;

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);

} // namespace device_conv2d_fwd_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

template <typename T>
static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
{
    float max_diff = 1e-6;

    for(int i = 0; i < ref.mData.size(); ++i)
    {
        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
        if(max_diff < diff)
        {
            return false;
        }
    }
    return true;
}

int main(int argc, char* argv[])
{
    int data_type   = 0;
    int init_method = 0;

    // Conv shape
    ck::index_t N               = 128;
    ck::index_t K               = 256;
    ck::index_t C               = 192;
    ck::index_t Y               = 3;
    ck::index_t X               = 3;
    ck::index_t Hi              = 71;
    ck::index_t Wi              = 71;
    ck::index_t conv_stride_h   = 2;
    ck::index_t conv_stride_w   = 2;
    ck::index_t conv_dilation_h = 1;
    ck::index_t conv_dilation_w = 1;
    ck::index_t in_left_pad_h   = 1;
    ck::index_t in_left_pad_w   = 1;
    ck::index_t in_right_pad_h  = 1;
    ck::index_t in_right_pad_w  = 1;

    if(argc == 1)
    {
        data_type   = 1;
        init_method = 1;
    }
    else if(argc == 3)
    {
        data_type   = std::stoi(argv[1]);
        init_method = std::stoi(argv[2]);
    }
    else if(argc == 18)
    {
        data_type   = std::stoi(argv[1]);
        init_method = std::stoi(argv[2]);

        N               = std::stoi(argv[3]);
        K               = std::stoi(argv[4]);
        C               = std::stoi(argv[5]);
        Y               = std::stoi(argv[6]);
        X               = std::stoi(argv[7]);
        Hi              = std::stoi(argv[8]);
        Wi              = std::stoi(argv[9]);
        conv_stride_h   = std::stoi(argv[10]);
        conv_stride_w   = std::stoi(argv[11]);
        conv_dilation_h = std::stoi(argv[12]);
        conv_dilation_w = std::stoi(argv[13]);
        in_left_pad_h   = std::stoi(argv[14]);
        in_left_pad_w   = std::stoi(argv[15]);
        in_right_pad_h  = std::stoi(argv[16]);
        in_right_pad_w  = std::stoi(argv[17]);
    }
    else
    {
        printf("arg1: data type (0=fp32, 1=fp16, 2= bfp16, 3= int8_t )\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3 to 17: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
        exit(1);
    }

    auto Run = [&](auto input_type, auto wei_type, auto out_type) {
        using InDataType  = decltype(input_type);
        using WeiDataType = decltype(wei_type);
        using OutDataType = decltype(out_type);

        using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<
            InDataType, WeiDataType, OutDataType, InElementOp, WeiElementOp, OutElementOp>;

        const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
        const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;

        const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
        const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;

        const std::vector<ck::index_t> input_spatial_lengths{Hi, Wi};
        const std::vector<ck::index_t> filter_spatial_lengths{Y, X};
        const std::vector<ck::index_t> output_spatial_lengths{Ho, Wo};
        const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
        const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
        const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
        const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};

        auto f_host_tensor_descriptor =
            [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
            };

        Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
        Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X));
        Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
        Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));

        std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
        std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
        std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;

        switch(init_method)
        {
        case 0: break;
        case 1:
            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
            break;
        default:
            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0, 1});
            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1, 1});
        }

        DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
        DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
        DeviceMem out_device_buf(sizeof(OutDataType) * out_n_k_ho_wo_device_result.mDesc.GetElementSpace());

        in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
        wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());

        using PassThrough = ck::tensor_operation::element_wise::PassThrough;
        using DeviceConvFwdNoOpPtr =
            ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;

        // add device Conv instances
        std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;

        if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
                     ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
                     ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
        {
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
        }
        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
        {
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
        }
        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::bhalf_t> &&
                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::bhalf_t> &&
                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::bhalf_t>)
        {
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
        }
        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, int8_t> &&
                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, int8_t> &&
                          ck::is_same_v<ck::remove_cv_t<OutDataType>, int8_t>)
        {
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
        }

        if(conv_ptrs.size() <= 0)
        {
            throw std::runtime_error("wrong! no device Conv instance found");
        }

        auto ref_conv     = ReferenceConvFwdInstance{};
        auto ref_invoker  = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
                                                  wei_k_c_y_x,
                                                  out_n_k_ho_wo_host_result,
                                                  conv_filter_strides,
                                                  conv_filter_dilations,
                                                  input_left_pads,
                                                  input_right_pads,
                                                  InElementOp{},
                                                  WeiElementOp{},
                                                  OutElementOp{});
        ref_invoker.Run(ref_argument);

        // profile device Conv instances
        bool success = false;
        for(auto& conv_ptr : conv_ptrs)
        {
            auto argument_ptr = conv_ptr->MakeArgumentPointer(
                static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                N, K, C,
                input_spatial_lengths,
                filter_spatial_lengths,
                output_spatial_lengths,
                conv_filter_strides,
                conv_filter_dilations,
                input_left_pads,
                input_right_pads,
                PassThrough{}, PassThrough{}, PassThrough{});

            auto invoker_ptr = conv_ptr->MakeInvokerPointer();

            if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
            {
                invoker_ptr->Run(argument_ptr.get(), 0);

                out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

                if(!check_out(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result))
                {
                    success = false;
                    break;
                }
                success = true;
            }
        }

        if(success)
        {
            std::cout << "test conv2d fwd : Pass" << std::endl;
            return 0;
        }
        else
        {
            std::cout << "test conv2d fwd: Fail " << std::endl;
            return -1;
        }
    };

    int res = -1;
    if(data_type == 0)
    {
        res = Run(float(), float(), float());
    }
    else if(data_type == 1)
    {
        res = Run(ck::half_t(), ck::half_t(), ck::half_t());
    }
    else if(data_type == 2)
    {
        Run(ck::bhalf_t(), ck::bhalf_t(), ck::bhalf_t());
    }
    else if(data_type == 3)
    {
        res = Run(int8_t(), int8_t(), int8_t());
    }
    return res;
}
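Note on verification: the deleted test's check_out above uses a fixed absolute tolerance of 1e-6, while the new tests in this commit call ck::utils::check_err(out, ref, msg, 1e-5f, 1e-4f), i.e. a combined relative/absolute comparison. The sketch below is only an illustration of that style of check, written stand-alone; almost_equal is a hypothetical name, and the interpretation of the two trailing arguments as (rtol, atol) is an assumption inferred from the call sites, not taken from the library.

#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical stand-in for a mixed relative/absolute element-wise comparison.
template <typename T>
bool almost_equal(const std::vector<T>& out, const std::vector<T>& ref,
                  double rtol = 1e-5, double atol = 1e-4)
{
    if(out.size() != ref.size())
        return false;
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        const double diff = std::abs(double(out[i]) - double(ref[i]));
        // accept if the error is small in absolute terms or relative to the reference value
        if(diff > atol && diff > rtol * std::abs(double(ref[i])))
            return false;
    }
    return true;
}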
test/conv_util/conv_util.cpp

@@ -3,36 +3,13 @@
 #include <vector>
 #include "config.hpp"
-#include "conv_utils.hpp"
+#include "conv_fwd_util.hpp"
 #include "tensor_layout.hpp"
 #include "check_err.hpp"

 namespace {

-template <typename T>
-bool cmp_vec(const std::vector<T>& out, const std::vector<T>& ref, const std::string& msg)
-{
-    if(out.size() != ref.size())
-    {
-        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size() << std::endl << msg << std::endl;
-        return false;
-    }
-    for(std::size_t i = 0; i < ref.size(); ++i)
-    {
-        if(out[i] != ref[i])
-        {
-            std::cout << "out[" << i << "] != ref[" << i << "]: " << out[i] << "!=" << ref[i] << std::endl << msg << std::endl;
-            return false;
-        }
-    }
-    return true;
-}
-
-bool TestConvParams_GetOutputSpatialLengths()
+bool test_conv_params_get_output_spatial_lengths()
 {
     bool res{true};
     // -------------------------- default 2D ------------------------------------
@@ -41,28 +18,28 @@ bool TestConvParams_GetOutputSpatialLengths()
     // stride {2,2},
     // dilations {1,1},
     // padding {{1,1}, {1,1}}
-    ck::conv_util::ConvParams conv_params;
+    ck::utils::conv::ConvParams conv_params;
     std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D default constructor.");
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D default constructor.");

     conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}.");
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}.");

     conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2};
     conv_params.input_left_pads = std::vector<ck::index_t>{2, 2};
     conv_params.input_right_pads = std::vector<ck::index_t>{2, 2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len, std::vector<ck::index_t>{37, 37}, "Error: ConvParams 2D padding left/right {2,2}.");
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{37, 37}, "Error: ConvParams 2D padding left/right {2,2}.");

     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}.");
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}.");

     conv_params.conv_filter_strides = std::vector<ck::index_t>{3, 3};
@@ -70,9 +47,10 @@ bool TestConvParams_GetOutputSpatialLengths()
     conv_params.input_right_pads = std::vector<ck::index_t>{1, 1};
     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len, std::vector<ck::index_t>{23, 23}, "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.");
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{23, 23}, "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.");

     // -------------------------- 1D ------------------------------------
     conv_params.num_dim_spatial = 1;
@@ -84,25 +62,25 @@ bool TestConvParams_GetOutputSpatialLengths()
     conv_params.input_right_pads = std::vector<ck::index_t>{1};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D default constructor.");
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D.");

-    conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
+    conv_params.conv_filter_strides = std::vector<ck::index_t>{1};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}.");
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}.");

     conv_params.conv_filter_strides = std::vector<ck::index_t>{2};
     conv_params.input_left_pads = std::vector<ck::index_t>{2};
     conv_params.input_right_pads = std::vector<ck::index_t>{2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len, std::vector<ck::index_t>{37}, "Error: ConvParams 1D padding left/right {2}.");
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{37}, "Error: ConvParams 1D padding left/right {2}.");

     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}.");
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}.");

     conv_params.conv_filter_strides = std::vector<ck::index_t>{3};
@@ -110,36 +88,104 @@ bool TestConvParams_GetOutputSpatialLengths()
     conv_params.input_right_pads = std::vector<ck::index_t>{1};
     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
     out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len, std::vector<ck::index_t>{23}, "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.");
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{23}, "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.");
+
+    // -------------------------- 3D ------------------------------------
+    conv_params.num_dim_spatial = 3;
+    conv_params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
+    conv_params.input_spatial_lengths = std::vector<ck::index_t>{71, 71, 71};
+    conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2, 2};
+    conv_params.conv_filter_dilations = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};
+    out_spatial_len = conv_params.GetOutputSpatialLengths();
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D.");
+
+    conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1, 1};
+    out_spatial_len = conv_params.GetOutputSpatialLengths();
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{71, 71, 71}, "Error: ConvParams 3D stride {1, 1, 1}.");
+
+    conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2, 2};
+    conv_params.input_left_pads = std::vector<ck::index_t>{2, 2, 2};
+    conv_params.input_right_pads = std::vector<ck::index_t>{2, 2, 2};
+    out_spatial_len = conv_params.GetOutputSpatialLengths();
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{37, 37, 37}, "Error: ConvParams 3D padding left/right {2, 2, 2}.");
+
+    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
+    out_spatial_len = conv_params.GetOutputSpatialLengths();
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D dilation {2, 2, 2}.");
+
+    conv_params.conv_filter_strides = std::vector<ck::index_t>{3, 3, 3};
+    conv_params.input_left_pads = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.input_right_pads = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
+    out_spatial_len = conv_params.GetOutputSpatialLengths();
+    res = ck::utils::check_err(out_spatial_len, std::vector<ck::index_t>{23, 23, 23}, "Error: ConvParams 3D strides{3, 3, 3}, padding {1, 1, 1}, dilations {2, 2, 2}.");
+
+    return res;
+}

-bool TestGetHostTensorDescriptor()
+bool test_get_host_tensor_descriptor()
 {
     bool res{true};
     namespace tl = ck::tensor_layout::convolution;
     std::vector<std::size_t> dims{2, 3, 4, 5};
-    HostTensorDescriptor h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{});
-    res = cmp_vec(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!");
-    res = cmp_vec(h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!");
+    HostTensorDescriptor h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{});
+    ck::utils::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!");
+    res = ck::utils::check_err(h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!");

-    h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCHW{});
-    res = cmp_vec(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!");
-    res = cmp_vec(h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!");
+    h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCHW{});
+    ck::utils::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!");
+    res = ck::utils::check_err(h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!");

     dims = std::vector<std::size_t>{2, 3, 4};
-    h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{});
-    res = cmp_vec(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!");
-    res = cmp_vec(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!");
+    h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{});
+    res = ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!");
+    res = ck::utils::check_err(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!");

-    h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCW{});
-    res = cmp_vec(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!");
-    res = cmp_vec(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!");
+    h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCW{});
+    res = ck::utils::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!");
+    res = ck::utils::check_err(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!");
+
+    dims = std::vector<std::size_t>{2, 3, 4, 5, 6};
+    h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{});
+    res = ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NDHWC dimensions lengths!");
+    res = ck::utils::check_err(h.GetStrides(),
+                               {3 * 4 * 5 * 6, // N
+                                1,             // C
+                                3 * 5 * 6,     // D
+                                3 * 6,         // H
+                                3},            // W
+                               "Error: wrong NDHWC dimensions strides!");
+
+    h = ck::utils::conv::get_host_tensor_descriptor(dims, tl::NCDHW{});
+    res = ck::utils::check_err(h.GetLengths(), dims, "Error: wrong NCDHW dimensions lengths!");
+    res = ck::utils::check_err(h.GetStrides(),
+                               {3 * 4 * 5 * 6, // N
+                                4 * 5 * 6,     // C
+                                5 * 6,         // D
+                                6,             // H
+                                1},            // W
+                               "Error: wrong NCDHW dimensions strides!");

     return res;
 }
@@ -148,10 +194,11 @@ bool TestGetHostTensorDescriptor()
 int main(void)
 {
-    bool res = TestConvParams_GetOutputSpatialLengths();
-    std::cout << "TestConvParams_GetOutputSpatialLengths ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    res = TestGetHostTensorDescriptor();
-    std::cout << "TestGetHostTensorDescriptor ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    return 0;
+    bool res = test_conv_params_get_output_spatial_lengths();
+    std::cout << "test_conv_params_get_output_spatial_lengths ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
+    res = test_get_host_tensor_descriptor();
+    std::cout << "test_get_host_tensor_descriptor ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
+    return res ? 0 : 1;
 }
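The expected values asserted above all follow the usual convolution output-length formula: out = (in + pad_left + pad_right - ((filter - 1) * dilation + 1)) / stride + 1. A small stand-alone sketch (plain C++, independent of composable_kernel; the helper name is illustrative only) that reproduces the numbers for the default 2-D input length of 71:

#include <cassert>

// out = (in + pad_left + pad_right - ((filter - 1) * dilation + 1)) / stride + 1
int conv_out_length(int in, int filter, int stride, int dilation, int pad_l, int pad_r)
{
    const int eff_filter = (filter - 1) * dilation + 1;
    return (in + pad_l + pad_r - eff_filter) / stride + 1;
}

int main()
{
    assert(conv_out_length(71, 3, 2, 1, 1, 1) == 36); // default 2D case
    assert(conv_out_length(71, 3, 1, 1, 1, 1) == 71); // stride {1,1}
    assert(conv_out_length(71, 3, 2, 1, 2, 2) == 37); // padding left/right {2,2}
    assert(conv_out_length(71, 3, 2, 2, 2, 2) == 36); // dilation {2,2}
    assert(conv_out_length(71, 3, 3, 2, 1, 1) == 23); // strides {3,3}, padding {1,1}, dilations {2,2}
    return 0;
}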
test/convnd_bwd_data/CMakeLists.txt (new file, 0 → 100644)

include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/profiler/include
    ${PROJECT_SOURCE_DIR}/external/include/half
)
add_test_executable(test_convnd_bwd_data convnd_bwd_data.cpp)
target_link_libraries(test_convnd_bwd_data PRIVATE host_tensor)
target_link_libraries(test_convnd_bwd_data PRIVATE device_convnd_bwd_data_instance)
test/convnd_bwd_data/convnd_bwd_data.cpp (new file, 0 → 100644)

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include <vector>

#include "profile_convnd_bwd_data_impl.hpp"

int main()
{
    bool pass = true;

    // check 1d
    std::vector<ck::utils::conv::ConvParams> params;
    params.push_back({1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}});
    params.push_back({1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}});
    params.push_back({1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}});

    for(auto& param : params)
    {
        pass &= ck::profiler::profile_convnd_bwd_data_impl<1, float, float, float, float,
            ck::tensor_layout::convolution::NWC, ck::tensor_layout::convolution::KXC, ck::tensor_layout::convolution::NWK>(
            1, 1, 0, 1, // do_verification, init_method, do_log, nrepeat
            param.N, param.K, param.C, param.input_spatial_lengths, param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(), param.conv_filter_strides, param.conv_filter_dilations,
            param.input_left_pads, param.input_right_pads);

        pass &= ck::profiler::profile_convnd_bwd_data_impl<1, ck::half_t, ck::half_t, ck::half_t, float,
            ck::tensor_layout::convolution::NWC, ck::tensor_layout::convolution::KXC, ck::tensor_layout::convolution::NWK>(
            1, 1, 0, 1, // do_verification, init_method, do_log, nrepeat
            param.N, param.K, param.C, param.input_spatial_lengths, param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(), param.conv_filter_strides, param.conv_filter_dilations,
            param.input_left_pads, param.input_right_pads);

        pass &= ck::profiler::profile_convnd_bwd_data_impl<1, ck::bhalf_t, ck::bhalf_t, ck::bhalf_t, float,
            ck::tensor_layout::convolution::NWC, ck::tensor_layout::convolution::KXC, ck::tensor_layout::convolution::NWK>(
            1, 1, 0, 1, // do_verification, init_method, do_log, nrepeat
            param.N, param.K, param.C, param.input_spatial_lengths, param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(), param.conv_filter_strides, param.conv_filter_dilations,
            param.input_left_pads, param.input_right_pads);

        pass &= ck::profiler::profile_convnd_bwd_data_impl<1, int8_t, int8_t, int8_t, int,
            ck::tensor_layout::convolution::NWC, ck::tensor_layout::convolution::KXC, ck::tensor_layout::convolution::NWK>(
            1, 1, 0, 1, // do_verification, init_method, do_log, nrepeat
            param.N, param.K, param.C, param.input_spatial_lengths, param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(), param.conv_filter_strides, param.conv_filter_dilations,
            param.input_left_pads, param.input_right_pads);
    }

    // check 2d
    params.clear();
    params.push_back({2, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
    params.push_back({2, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
    params.push_back({2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});

    for(auto& param : params)
    {
        pass &= ck::profiler::profile_convnd_bwd_data_impl<2, float, float, float, float,
            ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>(
            1, 1, 0, 1, // do_verification, init_method, do_log, nrepeat
            param.N, param.K, param.C, param.input_spatial_lengths, param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(), param.conv_filter_strides, param.conv_filter_dilations,
            param.input_left_pads, param.input_right_pads);

        pass &= ck::profiler::profile_convnd_bwd_data_impl<2, ck::half_t, ck::half_t, ck::half_t, float,
            ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>(
            1, 1, 0, 1, // do_verification, init_method, do_log, nrepeat
            param.N, param.K, param.C, param.input_spatial_lengths, param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(), param.conv_filter_strides, param.conv_filter_dilations,
            param.input_left_pads, param.input_right_pads);

        pass &= ck::profiler::profile_convnd_bwd_data_impl<2, ck::bhalf_t, ck::bhalf_t, ck::bhalf_t, float,
            ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>(
            1, 1, 0, 1, // do_verification, init_method, do_log, nrepeat
            param.N, param.K, param.C, param.input_spatial_lengths, param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(), param.conv_filter_strides, param.conv_filter_dilations,
            param.input_left_pads, param.input_right_pads);

        pass &= ck::profiler::profile_convnd_bwd_data_impl<2, int8_t, int8_t, int8_t, int,
            ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::NHWK>(
            1, 1, 0, 1, // do_verification, init_method, do_log, nrepeat
            param.N, param.K, param.C, param.input_spatial_lengths, param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(), param.conv_filter_strides, param.conv_filter_dilations,
            param.input_left_pads, param.input_right_pads);
    }

    // check 3d
    params.clear();
    params.push_back({3, 128, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
    params.push_back({3, 128, 128, 256, {3, 3, 3}, {14, 14, 14}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
    params.push_back({3, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});

    for(auto& param : params)
    {
        pass &= ck::profiler::profile_convnd_bwd_data_impl<3, float, float, float, float,
            ck::tensor_layout::convolution::NDHWC, ck::tensor_layout::convolution::KZYXC, ck::tensor_layout::convolution::NDHWK>(
            1, 1, 0, 1, // do_verification, init_method, do_log, nrepeat
            param.N, param.K, param.C, param.input_spatial_lengths, param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(), param.conv_filter_strides, param.conv_filter_dilations,
            param.input_left_pads, param.input_right_pads);

        pass &= ck::profiler::profile_convnd_bwd_data_impl<3, ck::half_t, ck::half_t, ck::half_t, float,
            ck::tensor_layout::convolution::NDHWC, ck::tensor_layout::convolution::KZYXC, ck::tensor_layout::convolution::NDHWK>(
            1, 1, 0, 1, // do_verification, init_method, do_log, nrepeat
            param.N, param.K, param.C, param.input_spatial_lengths, param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(), param.conv_filter_strides, param.conv_filter_dilations,
            param.input_left_pads, param.input_right_pads);

        pass &= ck::profiler::profile_convnd_bwd_data_impl<3, ck::bhalf_t, ck::bhalf_t, ck::bhalf_t, float,
            ck::tensor_layout::convolution::NDHWC, ck::tensor_layout::convolution::KZYXC, ck::tensor_layout::convolution::NDHWK>(
            1, 1, 0, 1, // do_verification, init_method, do_log, nrepeat
            param.N, param.K, param.C, param.input_spatial_lengths, param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(), param.conv_filter_strides, param.conv_filter_dilations,
            param.input_left_pads, param.input_right_pads);

        pass &= ck::profiler::profile_convnd_bwd_data_impl<3, int8_t, int8_t, int8_t, int,
            ck::tensor_layout::convolution::NDHWC, ck::tensor_layout::convolution::KZYXC, ck::tensor_layout::convolution::NDHWK>(
            1, 1, 0, 1, // do_verification, init_method, do_log, nrepeat
            param.N, param.K, param.C, param.input_spatial_lengths, param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(), param.conv_filter_strides, param.conv_filter_dilations,
            param.input_left_pads, param.input_right_pads);
    }

    if(pass)
    {
        std::cout << "test convnd bwd : Pass" << std::endl;
        return 0;
    }
    else
    {
        std::cout << "test convnd bwd: Fail " << std::endl;
        return -1;
    }
}
test/convnd_fwd/CMakeLists.txt

-add_test_executable(test_convnd_fwd convnd_fwd.cpp)
-target_link_libraries(test_convnd_fwd PRIVATE host_tensor)
+add_custom_target(test_convnd_fwd)
+
+add_test_executable(test_conv1d_fwd conv1d_fwd.cpp)
+target_link_libraries(test_conv1d_fwd PRIVATE host_tensor)
+target_link_libraries(test_conv1d_fwd PRIVATE device_conv1d_fwd_instance)
+add_dependencies(test_convnd_fwd test_conv1d_fwd)
+
+add_test_executable(test_conv2d_fwd conv2d_fwd.cpp)
+target_link_libraries(test_conv2d_fwd PRIVATE host_tensor)
+target_link_libraries(test_conv2d_fwd PRIVATE device_conv2d_fwd_instance)
+add_dependencies(test_convnd_fwd test_conv2d_fwd)
+
+add_test_executable(test_conv3d_fwd conv3d_fwd.cpp)
+target_link_libraries(test_conv3d_fwd PRIVATE host_tensor)
+target_link_libraries(test_conv3d_fwd PRIVATE device_conv3d_fwd_instance)
+add_dependencies(test_convnd_fwd test_conv3d_fwd)
test/convnd_fwd/conv1d_fwd.cpp (new file, 0 → 100644)

#include <iostream>
#include <stdexcept>
#include <tuple>
#include <vector>

#include "data_type.hpp"
#include "element_wise_operation.hpp"
#include "conv_fwd_util.hpp"
#include "conv_util.hpp"
#include "host_tensor.hpp"
#include "tensor_layout.hpp"
#include "check_err.hpp"

// Forward declarations for conv instances.
using DeviceConvFwdNoOpPtr =
    ck::tensor_operation::device::DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
                                                   ck::tensor_operation::element_wise::PassThrough,
                                                   ck::tensor_operation::element_wise::PassThrough>;

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv1d_fwd_instance {

void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);

} // namespace device_conv1d_fwd_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

namespace {

bool test_conv1D_nwc()
{
    bool res{true};

    ck::utils::conv::ConvParams params;
    params.num_dim_spatial        = 1;
    params.N                      = 2;
    params.K                      = 16;
    params.C                      = 4;
    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
    params.input_spatial_lengths  = std::vector<ck::index_t>{16};
    params.conv_filter_strides    = std::vector<ck::index_t>{1};
    params.conv_filter_dilations  = std::vector<ck::index_t>{1};
    params.input_left_pads        = std::vector<ck::index_t>{1};
    params.input_right_pads       = std::vector<ck::index_t>{1};

    auto host_tensors = ck::utils::conv::get_host_tensors<float, float, float,
                                                          ck::tensor_layout::convolution::NWC,
                                                          ck::tensor_layout::convolution::KXC,
                                                          ck::tensor_layout::convolution::NWK>(params);
    const Tensor<float>& input   = std::get<0>(host_tensors);
    const Tensor<float>& weights = std::get<1>(host_tensors);
    Tensor<float>& host_output   = std::get<2>(host_tensors);
    Tensor<float>& device_output = std::get<3>(host_tensors);

    ck::utils::conv::run_reference_convolution_forward<1>(params, input, weights, host_output);
    test::conv::RunConv<1>(params, input, weights, device_output);
    res = res && ck::utils::check_err(device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);

    return res;
}

template <typename T>
bool test_conv1d_nwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
{
    ck::utils::conv::ConvParams params;
    params.num_dim_spatial        = 1;
    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
    params.input_spatial_lengths  = std::vector<ck::index_t>{71};
    params.conv_filter_strides    = std::vector<ck::index_t>{2};
    params.conv_filter_dilations  = std::vector<ck::index_t>{1};
    params.input_left_pads        = std::vector<ck::index_t>{1};
    params.input_right_pads       = std::vector<ck::index_t>{1};

    auto host_tensors = ck::utils::conv::get_host_tensors<T, T, T,
                                                          ck::tensor_layout::convolution::NWC,
                                                          ck::tensor_layout::convolution::KXC,
                                                          ck::tensor_layout::convolution::NWK>(params);
    const Tensor<T>& input   = std::get<0>(host_tensors);
    const Tensor<T>& weights = std::get<1>(host_tensors);
    Tensor<T>& host_output   = std::get<2>(host_tensors);
    Tensor<T>& device_output = std::get<3>(host_tensors);

    ck::utils::conv::run_reference_convolution_forward<1>(params, input, weights, host_output);
    return ck::utils::conv::run_convolution_forward_instances<1>(
        params, conv_ptrs, input, weights, device_output, host_output);
}

bool test_conv1d_nwc_bf16_instances()
{
    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
    ck::tensor_operation::device::device_conv1d_fwd_instance::
        add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs);
    return test_conv1d_nwc_instances<ck::bhalf_t>(conv_ptrs);
}

bool test_conv1d_nwc_f16_instances()
{
    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
    ck::tensor_operation::device::device_conv1d_fwd_instance::
        add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs);
    return test_conv1d_nwc_instances<ck::half_t>(conv_ptrs);
}

bool test_conv1d_nwc_f32_instances()
{
    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
    ck::tensor_operation::device::device_conv1d_fwd_instance::
        add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs);
    return test_conv1d_nwc_instances<float>(conv_ptrs);
}

bool test_conv1d_nwc_int8_instances()
{
    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
    ck::tensor_operation::device::device_conv1d_fwd_instance::
        add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs);
    return test_conv1d_nwc_instances<int8_t>(conv_ptrs);
}

} // anonymous namespace

int main()
{
    bool res{true};

    res = test_conv1D_nwc();
    std::cout << "test_conv1D_nwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    res = test_conv1d_nwc_bf16_instances();
    std::cout << "\nTestConv1DNWCBF16Instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    res = test_conv1d_nwc_f16_instances();
    std::cout << "\ntest_conv1d_nwc_f16_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    res = test_conv1d_nwc_f32_instances();
    std::cout << "\ntest_conv1d_nwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    res = test_conv1d_nwc_int8_instances();
    std::cout << "\ntes_tconv1_dnw_cint_8instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;

    return res ? 0 : 1;
}
test/convnd_fwd/conv2d_fwd.cpp (new file, 0 → 100644)

#include <half.hpp>
#include <iostream>
#include <stdexcept>
#include <tuple>
#include <vector>

#include "data_type.hpp"
#include "element_wise_operation.hpp"
#include "conv_fwd_util.hpp"
#include "conv_util.hpp"
#include "host_tensor.hpp"
#include "tensor_layout.hpp"
#include "check_err.hpp"

// Forward declarations for conv instances.
using DeviceConvFwdNoOpPtr =
    ck::tensor_operation::device::DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
                                                   ck::tensor_operation::element_wise::PassThrough,
                                                   ck::tensor_operation::element_wise::PassThrough>;

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv2d_fwd_instance {

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);

} // namespace device_conv2d_fwd_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

namespace {

bool test_conv2d_nhwc()
{
    bool res{true};

    ck::utils::conv::ConvParams params;
    params.N                     = 2;
    params.K                     = 16;
    params.C                     = 4;
    params.input_spatial_lengths = std::vector<ck::index_t>{16, 16};
    params.conv_filter_strides   = std::vector<ck::index_t>{1, 1};

    auto host_tensors = ck::utils::conv::get_host_tensors(params);
    const Tensor<float>& input   = std::get<0>(host_tensors);
    const Tensor<float>& weights = std::get<1>(host_tensors);
    Tensor<float>& host_output   = std::get<2>(host_tensors);
    Tensor<float>& device_output = std::get<3>(host_tensors);

    ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output);
    test::conv::RunConv<2>(params, input, weights, device_output);
    res = res && ck::utils::check_err(device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);

    return res;
}

template <typename T>
bool test_conv2d_nhwc_instances(const std::vector<DeviceConvFwdNoOpPtr>& conv_ptrs)
{
    ck::utils::conv::ConvParams params;
    params.num_dim_spatial        = 2;
    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
    params.input_spatial_lengths  = std::vector<ck::index_t>{71, 71};
    params.conv_filter_strides    = std::vector<ck::index_t>{2, 2};
    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1};
    params.input_left_pads        = std::vector<ck::index_t>{1, 1};
    params.input_right_pads       = std::vector<ck::index_t>{1, 1};

    auto host_tensors = ck::utils::conv::get_host_tensors<T, T, T,
                                                          ck::tensor_layout::convolution::NHWC,
                                                          ck::tensor_layout::convolution::KYXC,
                                                          ck::tensor_layout::convolution::NHWK>(params);
    const Tensor<T>& input   = std::get<0>(host_tensors);
    const Tensor<T>& weights = std::get<1>(host_tensors);
    Tensor<T>& host_output   = std::get<2>(host_tensors);
    Tensor<T>& device_output = std::get<3>(host_tensors);

    ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output);
    return ck::utils::conv::run_convolution_forward_instances<2>(
        params, conv_ptrs, input, weights, device_output, host_output);
}

bool test_conv2d_nhwc_bf16_instances()
{
    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
    ck::tensor_operation::device::device_conv2d_fwd_instance::
        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
    return test_conv2d_nhwc_instances<ck::bhalf_t>(conv_ptrs);
}

bool test_conv2d_nhwc_f16_instances()
{
    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
    ck::tensor_operation::device::device_conv2d_fwd_instance::
        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
    ck::tensor_operation::device::device_conv2d_fwd_instance::
        add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
    return test_conv2d_nhwc_instances<ck::half_t>(conv_ptrs);
}

bool test_conv2d_nhwc_f32_instances()
{
    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
    ck::tensor_operation::device::device_conv2d_fwd_instance::
        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
    return test_conv2d_nhwc_instances<float>(conv_ptrs);
}

bool test_conv2d_nhwc_int8_instances()
{
    std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
    ck::tensor_operation::device::device_conv2d_fwd_instance::
        add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
    return test_conv2d_nhwc_instances<int8_t>(conv_ptrs);
}

} // anonymous namespace

int main()
{
    bool res{true};

    res = test_conv2d_nhwc();
    std::cout << "test_conv2d_nhwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    res = test_conv2d_nhwc_bf16_instances();
    std::cout << "\ntest_conv2d_nhwc_bf16_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    res = test_conv2d_nhwc_f16_instances();
    std::cout << "\ntest_conv2d_nhwc_f16_instances ....." << (res ? "SUCCESS" : "FAILURE") << std::endl;
    res = test_conv2d_nhwc_f32_instances();
    std::cout << "\ntest_conv2d_nhwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    res = test_conv2d_nhwc_int8_instances();
    std::cout << "\ntest_conv2d_nhwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;

    return res ? 0 : 1;
}
test/convnd_fwd/conv3d_fwd.cpp
0 → 100644
View file @
dd6a8de4
#include <half.hpp>
#include <iostream>
#include <stdexcept>
#include <tuple>
#include <vector>
#include "data_type.hpp"
#include "element_wise_operation.hpp"
#include "conv_fwd_util.hpp"
#include "conv_util.hpp"
#include "host_tensor.hpp"
#include "tensor_layout.hpp"
#include "check_err.hpp"
// Forward declarations for conv instances.
using
DeviceConvFwdNoOpPtr
=
ck
::
tensor_operation
::
device
::
DeviceConvFwdPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_conv3d_fwd_instance
{
void
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
void
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
void
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
void
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances
(
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
);
}
// namespace device_conv3d_fwd_instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
namespace
{
bool
test_conv3d_ndhwc
()
{
bool
res
{
true
};
ck
::
utils
::
conv
::
ConvParams
params
;
params
.
num_dim_spatial
=
3
;
params
.
N
=
2
;
params
.
K
=
16
;
params
.
C
=
4
;
params
.
filter_spatial_lengths
=
std
::
vector
<
ck
::
index_t
>
{
3
,
3
,
3
};
params
.
input_spatial_lengths
=
std
::
vector
<
ck
::
index_t
>
{
16
,
16
,
16
};
params
.
conv_filter_strides
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
conv_filter_dilations
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
input_left_pads
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
input_right_pads
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
auto
host_tensors
=
ck
::
utils
::
conv
::
get_host_tensors
<
float
,
float
,
float
,
ck
::
tensor_layout
::
convolution
::
NDHWC
,
ck
::
tensor_layout
::
convolution
::
KZYXC
,
ck
::
tensor_layout
::
convolution
::
NDHWK
>
(
params
);
const
Tensor
<
float
>&
input
=
std
::
get
<
0
>
(
host_tensors
);
const
Tensor
<
float
>&
weights
=
std
::
get
<
1
>
(
host_tensors
);
Tensor
<
float
>&
host_output
=
std
::
get
<
2
>
(
host_tensors
);
Tensor
<
float
>&
device_output
=
std
::
get
<
3
>
(
host_tensors
);
ck
::
utils
::
conv
::
run_reference_convolution_forward
<
3
>
(
params
,
input
,
weights
,
host_output
);
test
::
conv
::
RunConv
<
3
>
(
params
,
input
,
weights
,
device_output
);
res
=
res
&&
ck
::
utils
::
check_err
(
device_output
.
mData
,
host_output
.
mData
,
"Error: incorrect results!"
,
1e-5
f
,
1e-4
f
);
return
res
;
}
bool
test_conv3d_ndhwc_2gb_input
()
{
// >2GB Input
ck
::
utils
::
conv
::
ConvParams
params
;
params
.
num_dim_spatial
=
3
;
params
.
N
=
2
;
params
.
K
=
16
;
params
.
C
=
32
;
params
.
filter_spatial_lengths
=
std
::
vector
<
ck
::
index_t
>
{
3
,
3
,
3
};
params
.
input_spatial_lengths
=
std
::
vector
<
ck
::
index_t
>
{
32
,
1000
,
1000
};
params
.
conv_filter_strides
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
conv_filter_dilations
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
input_left_pads
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
input_right_pads
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
auto
host_tensors
=
ck
::
utils
::
conv
::
get_host_tensors
<
float
,
float
,
float
,
ck
::
tensor_layout
::
convolution
::
NDHWC
,
ck
::
tensor_layout
::
convolution
::
KZYXC
,
ck
::
tensor_layout
::
convolution
::
NDHWK
>
(
params
,
false
);
const
Tensor
<
float
>&
input
=
std
::
get
<
0
>
(
host_tensors
);
const
Tensor
<
float
>&
weights
=
std
::
get
<
1
>
(
host_tensors
);
Tensor
<
float
>&
device_output
=
std
::
get
<
3
>
(
host_tensors
);
try
{
test
::
conv
::
RunConv
<
3
>
(
params
,
input
,
weights
,
device_output
);
}
catch
(
const
std
::
runtime_error
&
err
)
{
std
::
string
err_msg
{
"Error! device_conv with the specified compilation parameters does "
"not support this Conv problem"
};
if
(
err
.
what
()
!=
err_msg
)
{
return
false
;
}
return
true
;
}
std
::
cout
<<
"Error: Failure checking oversized tensor!"
<<
std
::
endl
;
return
false
;
}
bool
test_conv3d_ndhwc_2gb_filters
()
{
// >2GB Filters
ck
::
utils
::
conv
::
ConvParams
params
;
params
.
num_dim_spatial
=
3
;
params
.
N
=
2
;
params
.
K
=
16
;
params
.
C
=
32
;
params
.
filter_spatial_lengths
=
std
::
vector
<
ck
::
index_t
>
{
4
,
1000
,
1000
};
params
.
input_spatial_lengths
=
std
::
vector
<
ck
::
index_t
>
{
16
,
16
,
16
};
params
.
conv_filter_strides
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
conv_filter_dilations
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
input_left_pads
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
input_right_pads
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
auto
host_tensors
=
ck
::
utils
::
conv
::
get_host_tensors
<
float
,
float
,
float
,
ck
::
tensor_layout
::
convolution
::
NDHWC
,
ck
::
tensor_layout
::
convolution
::
KZYXC
,
ck
::
tensor_layout
::
convolution
::
NDHWK
>
(
params
,
false
);
const
Tensor
<
float
>&
input
=
std
::
get
<
0
>
(
host_tensors
);
const
Tensor
<
float
>&
weights
=
std
::
get
<
1
>
(
host_tensors
);
Tensor
<
float
>&
device_output
=
std
::
get
<
3
>
(
host_tensors
);
try
{
test
::
conv
::
RunConv
<
3
>
(
params
,
input
,
weights
,
device_output
);
}
catch
(
const
std
::
runtime_error
&
err
)
{
std
::
string
err_msg
{
"Error! device_conv with the specified compilation parameters does "
"not support this Conv problem"
};
if
(
err
.
what
()
!=
err_msg
)
{
return
false
;
}
return
true
;
}
std
::
cout
<<
"Error: Failure checking oversized tensor!"
<<
std
::
endl
;
return
false
;
}
bool
test_conv3d_ndhwc_2gb_output
()
{
// >2GB Output
ck
::
utils
::
conv
::
ConvParams
params
;
params
.
num_dim_spatial
=
3
;
params
.
N
=
2
;
params
.
K
=
16
;
params
.
C
=
2
;
params
.
filter_spatial_lengths
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
input_spatial_lengths
=
std
::
vector
<
ck
::
index_t
>
{
1000
,
1000
,
30
};
params
.
conv_filter_strides
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
conv_filter_dilations
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
input_left_pads
=
std
::
vector
<
ck
::
index_t
>
{
2
,
2
,
2
};
params
.
input_right_pads
=
std
::
vector
<
ck
::
index_t
>
{
2
,
2
,
2
};
auto
host_tensors
=
ck
::
utils
::
conv
::
get_host_tensors
<
float
,
float
,
float
,
ck
::
tensor_layout
::
convolution
::
NDHWC
,
ck
::
tensor_layout
::
convolution
::
KZYXC
,
ck
::
tensor_layout
::
convolution
::
NDHWK
>
(
params
,
false
);
const
Tensor
<
float
>&
input
=
std
::
get
<
0
>
(
host_tensors
);
const
Tensor
<
float
>&
weights
=
std
::
get
<
1
>
(
host_tensors
);
Tensor
<
float
>&
device_output
=
std
::
get
<
3
>
(
host_tensors
);
try
{
test
::
conv
::
RunConv
<
3
>
(
params
,
input
,
weights
,
device_output
);
}
catch
(
const
std
::
runtime_error
&
err
)
{
std
::
string
err_msg
{
"Error! device_conv with the specified compilation parameters does "
"not support this Conv problem"
};
if
(
err
.
what
()
!=
err_msg
)
{
return
false
;
}
return
true
;
}
std
::
cout
<<
"Error: Failure checking oversized tensor!"
<<
std
::
endl
;
return
false
;
}
template
<
typename
T
>
bool
test_conv3d_ndhwc_instances
(
const
std
::
vector
<
DeviceConvFwdNoOpPtr
>&
conv_ptrs
)
{
ck
::
utils
::
conv
::
ConvParams
params
;
params
.
N
=
64
;
params
.
num_dim_spatial
=
3
;
params
.
filter_spatial_lengths
=
std
::
vector
<
ck
::
index_t
>
{
3
,
3
,
2
};
params
.
input_spatial_lengths
=
std
::
vector
<
ck
::
index_t
>
{
32
,
32
,
2
};
params
.
conv_filter_strides
=
std
::
vector
<
ck
::
index_t
>
{
2
,
2
,
2
};
params
.
conv_filter_dilations
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
input_left_pads
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
params
.
input_right_pads
=
std
::
vector
<
ck
::
index_t
>
{
1
,
1
,
1
};
auto
host_tensors
=
ck
::
utils
::
conv
::
get_host_tensors
<
T
,
T
,
T
,
ck
::
tensor_layout
::
convolution
::
NDHWC
,
ck
::
tensor_layout
::
convolution
::
KZYXC
,
ck
::
tensor_layout
::
convolution
::
NDHWK
>
(
params
);
const
Tensor
<
T
>&
input
=
std
::
get
<
0
>
(
host_tensors
);
const
Tensor
<
T
>&
weights
=
std
::
get
<
1
>
(
host_tensors
);
Tensor
<
T
>&
host_output
=
std
::
get
<
2
>
(
host_tensors
);
Tensor
<
T
>&
device_output
=
std
::
get
<
3
>
(
host_tensors
);
ck
::
utils
::
conv
::
run_reference_convolution_forward
<
3
>
(
params
,
input
,
weights
,
host_output
);
return
ck
::
utils
::
conv
::
run_convolution_forward_instances
<
3
>
(
params
,
conv_ptrs
,
input
,
weights
,
device_output
,
host_output
);
}
bool
test_conv3d_ndhwc_bf16_instances
()
{
std
::
vector
<
DeviceConvFwdNoOpPtr
>
conv_ptrs
;
ck
::
tensor_operation
::
device
::
device_conv3d_fwd_instance
::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances
(
conv_ptrs
);
return
test_conv3d_ndhwc_instances
<
ck
::
bhalf_t
>
(
conv_ptrs
);
}
bool
test_conv3d_ndhwc_f16_instances
()
{
std
::
vector
<
DeviceConvFwdNoOpPtr
>
conv_ptrs
;
ck
::
tensor_operation
::
device
::
device_conv3d_fwd_instance
::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances
(
conv_ptrs
);
return
test_conv3d_ndhwc_instances
<
ck
::
half_t
>
(
conv_ptrs
);
}
bool
test_conv3d_ndhwc_f32_instances
()
{
std
::
vector
<
DeviceConvFwdNoOpPtr
>
conv_ptrs
;
ck
::
tensor_operation
::
device
::
device_conv3d_fwd_instance
::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances
(
conv_ptrs
);
return
test_conv3d_ndhwc_instances
<
float
>
(
conv_ptrs
);
}
bool
test_conv3d_ndhwc_int8_instances
()
{
std
::
vector
<
DeviceConvFwdNoOpPtr
>
conv_ptrs
;
ck
::
tensor_operation
::
device
::
device_conv3d_fwd_instance
::
add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances
(
conv_ptrs
);
return
test_conv3d_ndhwc_instances
<
int8_t
>
(
conv_ptrs
);
}
}
// anonymous namespace
int main()
{
    bool res{true};

    res = test_conv3d_ndhwc();
    std::cout << "test_conv3d_ndhwc ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;

    res = test_conv3d_ndhwc_2gb_input();
    std::cout << "\ntest_conv3d_ndhwc_2gb_input ..... " << (res ? "SUCCESS" : "FAILURE")
              << std::endl;

    res = test_conv3d_ndhwc_2gb_filters();
    std::cout << "\ntest_conv3d_ndhwc_2gb_filters ..... " << (res ? "SUCCESS" : "FAILURE")
              << std::endl;

    res = test_conv3d_ndhwc_2gb_output();
    std::cout << "\ntest_conv3d_ndhwc_2gb_output ..... " << (res ? "SUCCESS" : "FAILURE")
              << std::endl;

    res = test_conv3d_ndhwc_bf16_instances();
    std::cout << "\ntest_conv3d_ndhwc_bf16_instances ..... " << (res ? "SUCCESS" : "FAILURE")
              << std::endl;

    res = test_conv3d_ndhwc_f16_instances();
    std::cout << "\ntest_conv3d_ndhwc_f16_instances ..... " << (res ? "SUCCESS" : "FAILURE")
              << std::endl;

    res = test_conv3d_ndhwc_f32_instances();
    std::cout << "\ntest_conv3d_ndhwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE")
              << std::endl;

    res = test_conv3d_ndhwc_int8_instances();
    std::cout << "\ntest_conv3d_ndhwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE")
              << std::endl;

    return res ? 0 : 1;
}
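The 2 GB guard tests exercised above all follow one pattern: build a problem whose input, weight, or output tensor would be too large for the device instance, call the device path, and treat the specific "does not support this Conv problem" std::runtime_error as the expected outcome. Below is a minimal sketch of that pattern, reusing the RunConv helper and ConvParams type from this test; the function name and the relaxed message check are illustrative only, not part of this commit.

// Sketch only: returns true when RunConv rejects the (oversized) problem as unsupported.
// expect_conv3d_rejected is a hypothetical helper name, not part of this commit.
bool expect_conv3d_rejected(const ck::utils::conv::ConvParams& params,
                            const Tensor<float>& input,
                            const Tensor<float>& weights,
                            Tensor<float>& output)
{
    try
    {
        test::conv::RunConv<3>(params, input, weights, output);
    }
    catch(const std::runtime_error& err)
    {
        // The device instance reports an unsupported problem via this message.
        return std::string{err.what()}.find("not support this Conv problem") != std::string::npos;
    }
    return false; // silently running an oversized problem counts as a failure
}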
test/convnd_fwd/conv_util.hpp (new file, mode 100644)
#ifndef TEST_CONV_UTIL_HPP
#define TEST_CONV_UTIL_HPP

#include <tuple>

#include "config.hpp"
#include "conv_fwd_util.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "sequence.hpp"

namespace {

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

template <ck::index_t SpatialDims, typename InDataType, typename WeiDataType, typename OutDataType>
using DeviceConvNDFwdInstance =
    ck::tensor_operation::device::DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
        // clang-format off
        InDataType,         //
        WeiDataType,        //
        OutDataType,        //
        InDataType,         //
        InElementOp,        // Input Elementwise Operation
        WeiElementOp,       // Weights Elementwise Operation
        OutElementOp,       // Output Elementwise Operation
        ConvFwdDefault,     // ConvForwardSpecialization
        SpatialDims,        // SpatialDims
        64,                 // BlockSize
        16,                 // MPerBlock
        16,                 // NPerBlock
        4,                  // K0PerBlock
        1,                  // K1
        16,                 // MPerXDL
        16,                 // NPerXDL
        1,                  // MXdlPerWave
        1,                  // NXdlPerWave
        S<1, 16, 1>,        // ABlockTransferThreadClusterLengths_K0_M_K1
        S<1, 0, 2>,         // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,         // ABlockTransferSrcAccessOrder
        2,                  // ABlockTransferSrcVectorDim
        1,                  // ABlockTransferSrcScalarPerVector
        1,                  // ABlockTransferDstScalarPerVector_K1
        true,               // ABlockLdsAddExtraM
        S<1, 16, 1>,        // BBlockTransferThreadClusterLengths_K0_N_K1
        S<1, 0, 2>,         // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,         // BBlockTransferSrcAccessOrder
        2,                  // BBlockTransferSrcVectorDim
        1,                  // BBlockTransferSrcScalarPerVector
        1,                  // BBlockTransferDstScalarPerVector_K1
        true,               // BBlockTransferAddExtraN
        7,                  // CThreadTransferSrcDstVectorDim
        1>;                 // CThreadTransferDstScalarPerVector
// clang-format on

} // namespace

namespace test {
namespace conv {

template <ck::index_t NDim,
          typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float>
void RunConv(const ck::utils::conv::ConvParams& params,
             const Tensor<InDataType>& input,
             const Tensor<WeiDataType>& weights,
             Tensor<OutDataType>& output)
{
    ck::utils::conv::run_convolution_forward<NDim,
                                             InDataType,
                                             WeiDataType,
                                             OutDataType,
                                             DeviceConvNDFwdInstance>(params, input, weights, output);
}

} // namespace conv
} // namespace test
#endif
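For orientation, here is a minimal sketch of how a test would drive test::conv::RunConv against the host reference, in the same style as the 3-D test above. It assumes the ck::utils::conv helpers used earlier in this commit, that ConvParams default-constructs to a small 2-D problem, and that ck::utils::check_err accepts the two result vectors; the function name is illustrative only.

// Sketch of a caller; run_conv2d_nhwc_smoke_test is a hypothetical name.
bool run_conv2d_nhwc_smoke_test()
{
    ck::utils::conv::ConvParams params; // assumed to default to a small 2-D problem

    auto tensors = ck::utils::conv::get_host_tensors<float,
                                                     float,
                                                     float,
                                                     ck::tensor_layout::convolution::NHWC,
                                                     ck::tensor_layout::convolution::KYXC,
                                                     ck::tensor_layout::convolution::NHWK>(params);
    const Tensor<float>& input   = std::get<0>(tensors);
    const Tensor<float>& weights = std::get<1>(tensors);
    Tensor<float>& host_output   = std::get<2>(tensors);
    Tensor<float>& device_output = std::get<3>(tensors);

    // Host reference result vs. the DeviceConvNDFwdInstance selected by RunConv<2>.
    ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output);
    test::conv::RunConv<2>(params, input, weights, device_output);

    return ck::utils::check_err(device_output.mData, host_output.mData);
}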
test/convnd_fwd/convnd_fwd.cpp (deleted, mode 100644)
#include <algorithm>
#include <cstdlib>
#include <half.hpp>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>

#include "config.hpp"
#include "conv_utils.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"
#include "test_util.hpp"

namespace {

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;

template <ck::index_t SpatialDims, typename InDataType, typename WeiDataType, typename OutDataType>
using DeviceConvNDFwdInstance =
    ck::tensor_operation::device::DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
        // clang-format off
        InDataType,         //
        WeiDataType,        //
        OutDataType,        //
        InDataType,         //
        InElementOp,        // Input Elementwise Operation
        WeiElementOp,       // Weights Elementwise Operation
        OutElementOp,       // Output Elementwise Operation
        ConvFwdDefault,     // ConvForwardSpecialization
        SpatialDims,        // SpatialDims
        64,                 // BlockSize
        16,                 // MPerBlock
        16,                 // NPerBlock
        4,                  // K0PerBlock
        1,                  // K1
        16,                 // MPerXDL
        16,                 // NPerXDL
        1,                  // MXdlPerWave
        1,                  // NXdlPerWave
        S<1, 16, 1>,        // ABlockTransferThreadClusterLengths_K0_M_K1
        S<1, 0, 2>,         // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,         // ABlockTransferSrcAccessOrder
        2,                  // ABlockTransferSrcVectorDim
        1,                  // ABlockTransferSrcScalarPerVector
        1,                  // ABlockTransferDstScalarPerVector_K1
        true,               // ABlockLdsAddExtraM
        S<1, 16, 1>,        // BBlockTransferThreadClusterLengths_K0_N_K1
        S<1, 0, 2>,         // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,         // BBlockTransferSrcAccessOrder
        2,                  // BBlockTransferSrcVectorDim
        1,                  // BBlockTransferSrcScalarPerVector
        1,                  // BBlockTransferDstScalarPerVector_K1
        true,               // BBlockTransferAddExtraN
        7,                  // CThreadTransferSrcDstVectorDim
        1>;                 // CThreadTransferDstScalarPerVector
// clang-format on

template <typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float,
          typename InLayout    = ck::tensor_layout::convolution::NHWC,
          typename WeiLayout   = ck::tensor_layout::convolution::KYXC,
          typename OutLayout   = ck::tensor_layout::convolution::NHWK>
auto GetHostTensors(const ck::conv_util::ConvParams& params)
{
    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
                                        static_cast<std::size_t>(params.C)};
    input_dims.insert(std::end(input_dims),
                      std::begin(params.input_spatial_lengths),
                      std::end(params.input_spatial_lengths));

    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
                                         static_cast<std::size_t>(params.C)};
    filter_dims.insert(std::end(filter_dims),
                       std::begin(params.filter_spatial_lengths),
                       std::end(params.filter_spatial_lengths));

    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
                                         static_cast<std::size_t>(params.K)};
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));

    Tensor<InDataType> input(ck::conv_util::GetHostTensorDescriptor(input_dims, InLayout{}));
    Tensor<WeiDataType> weights(ck::conv_util::GetHostTensorDescriptor(filter_dims, WeiLayout{}));
    Tensor<OutDataType> host_output(ck::conv_util::GetHostTensorDescriptor(output_dims, OutLayout{}));
    Tensor<OutDataType> device_output(ck::conv_util::GetHostTensorDescriptor(output_dims, OutLayout{}));

    std::generate(input.begin(), input.end(), [n = 0]() mutable {
        return InDataType(n++) * InDataType(0.1f);
    });
    std::fill(weights.begin(), weights.end(), WeiDataType(0.5f));
    std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
    std::fill(device_output.begin(), device_output.end(), OutDataType(0.f));

    return std::make_tuple(input, weights, host_output, device_output);
}

template <ck::index_t NDim,
          typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float>
void RunReferenceConv(const ck::conv_util::ConvParams& params,
                      const Tensor<InDataType>& input,
                      const Tensor<WeiDataType>& weights,
                      Tensor<OutDataType>& output)
{
    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                 WeiDataType,
                                                                 OutDataType,
                                                                 InElementOp,
                                                                 WeiElementOp,
                                                                 OutElementOp,
                                                                 NDim>();
    auto ref_invoker  = ref_conv.MakeInvoker();
    auto ref_argument = ref_conv.MakeArgument(input,
                                              weights,
                                              output,
                                              params.conv_filter_strides,
                                              params.conv_filter_dilations,
                                              params.input_left_pads,
                                              params.input_right_pads,
                                              InElementOp{},
                                              WeiElementOp{},
                                              OutElementOp{});
    ref_invoker.Run(ref_argument);
}

template <ck::index_t NDim,
          typename InDataType  = float,
          typename WeiDataType = float,
          typename OutDataType = float>
void RunConv(const ck::conv_util::ConvParams& params,
             const Tensor<InDataType>& input,
             const Tensor<WeiDataType>& weights,
             Tensor<OutDataType>& output)
{
    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());

    in_device_buf.ToDevice(input.mData.data());
    wei_device_buf.ToDevice(weights.mData.data());

    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();

    auto conv     = DeviceConvNDFwdInstance<NDim, InDataType, WeiDataType, OutDataType>();
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                      params.N,
                                      params.K,
                                      params.C,
                                      params.input_spatial_lengths,
                                      params.filter_spatial_lengths,
                                      output_spatial_lengths,
                                      params.conv_filter_strides,
                                      params.conv_filter_dilations,
                                      params.input_left_pads,
                                      params.input_right_pads,
                                      InElementOp{},
                                      WeiElementOp{},
                                      OutElementOp{});

    if(!conv.IsSupportedArgument(argument))
    {
        throw std::runtime_error("Error! device_conv with the specified compilation parameters does "
                                 "not support this Conv problem");
    }

    invoker.Run(argument);
    out_device_buf.FromDevice(output.mData.data());
}

bool TestConv2DNHWC()
{
    bool res{true};
    ck::conv_util::ConvParams params;
    params.N                     = 2;
    params.K                     = 16;
    params.C                     = 4;
    params.input_spatial_lengths = std::vector<ck::index_t>{16, 16};
    params.conv_filter_strides   = std::vector<ck::index_t>{1, 1};

    auto host_tensors            = GetHostTensors(params);
    const Tensor<float>& input   = std::get<0>(host_tensors);
    const Tensor<float>& weights = std::get<1>(host_tensors);
    Tensor<float>& host_output   = std::get<2>(host_tensors);
    Tensor<float>& device_output = std::get<3>(host_tensors);

    RunReferenceConv<2>(params, input, weights, host_output);
    RunConv<2>(params, input, weights, device_output);
    res = res &&
          test_util::check_err(
              device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);

    return res;
}

bool TestConv1DNWC()
{
    bool res{true};
    ck::conv_util::ConvParams params;
    params.num_dim_spatial        = 1;
    params.N                      = 2;
    params.K                      = 16;
    params.C                      = 4;
    params.filter_spatial_lengths = std::vector<ck::index_t>{3};
    params.input_spatial_lengths  = std::vector<ck::index_t>{16};
    params.conv_filter_strides    = std::vector<ck::index_t>{1};
    params.conv_filter_dilations  = std::vector<ck::index_t>{1};
    params.input_left_pads        = std::vector<ck::index_t>{1};
    params.input_right_pads       = std::vector<ck::index_t>{1};

    auto host_tensors = GetHostTensors<float,
                                       float,
                                       float,
                                       ck::tensor_layout::convolution::NWC,
                                       ck::tensor_layout::convolution::KXC,
                                       ck::tensor_layout::convolution::NWK>(params);
    const Tensor<float>& input   = std::get<0>(host_tensors);
    const Tensor<float>& weights = std::get<1>(host_tensors);
    Tensor<float>& host_output   = std::get<2>(host_tensors);
    Tensor<float>& device_output = std::get<3>(host_tensors);

    RunReferenceConv<1>(params, input, weights, host_output);
    RunConv<1>(params, input, weights, device_output);
    res = res &&
          test_util::check_err(
              device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);

    return res;
}

} // anonymous namespace

int main()
{
    bool res{true};

    res = TestConv1DNWC();
    std::cout << "TestConv1DNWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    res = TestConv2DNHWC();
    std::cout << "TestConv2DNHWC ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
test/gemm/CMakeLists.txt
@@ -2,6 +2,10 @@ add_test_executable(test_gemm_fp32 gemm_fp32.cpp)
 target_link_libraries(test_gemm_fp32 PRIVATE host_tensor)
 target_link_libraries(test_gemm_fp32 PRIVATE device_gemm_instance)
 
+add_test_executable(test_gemm_fp16 gemm_fp16.cpp)
+target_link_libraries(test_gemm_fp16 PRIVATE host_tensor)
+target_link_libraries(test_gemm_fp16 PRIVATE device_gemm_instance)
+
 add_test_executable(test_gemm_bf16 gemm_bf16.cpp)
 target_link_libraries(test_gemm_bf16 PRIVATE host_tensor)
 target_link_libraries(test_gemm_bf16 PRIVATE device_gemm_instance)
test/gemm/gemm_bf16.cpp
@@ -19,11 +19,10 @@
 #include "element_wise_operation.hpp"
 #include "reference_gemm.hpp"
 #include "gemm_specialization.hpp"
-#include "test_util.hpp"
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using DeviceGemmPtr_ =
+using DeviceGemmNoOpPtr =
     ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
                                                 ck::tensor_operation::element_wise::PassThrough,
                                                 ck::tensor_operation::element_wise::PassThrough>;
@@ -32,132 +31,86 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace device_gemm_instance {
-void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(std::vector<DeviceGemmPtr_>&);
-}
+void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+} // namespace device_gemm_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
 
-namespace {
-
-using BF16 = ck::bhalf_t;
-
-using ADataType   = BF16;
-using BDataType   = BF16;
-using CDataType   = BF16;
-using AccDataType = float;
-
-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
-using CLayout = ck::tensor_layout::gemm::RowMajor;
-
-auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params)
-{
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
-            }
-            else
-            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
-            }
-        };
-
-    // use fp32 host kernel to verify bf16 device kernel
-    Tensor<ADataType> a_m_k_bf16(f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
-    Tensor<BDataType> b_k_n_bf16(f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
-    Tensor<CDataType> c_m_n_device_bf16(f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-    Tensor<float> a_m_k_fp32(f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
-    Tensor<float> b_k_n_fp32(f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
-    Tensor<float> c_m_n_host_fp32(f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-    Tensor<float> c_m_n_device_fp32(f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-
-    a_m_k_bf16.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5});
-    b_k_n_bf16.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-
-    bf16_to_f32_(a_m_k_bf16, a_m_k_fp32);
-    bf16_to_f32_(b_k_n_bf16, b_k_n_fp32);
-
-    return std::make_tuple(a_m_k_bf16, b_k_n_bf16, c_m_n_device_bf16,
-                           a_m_k_fp32, b_k_n_fp32, c_m_n_host_fp32, c_m_n_device_fp32);
-}
-
-bool TestGemm(DeviceGemmPtr_& gemmPtr)
+int main()
 {
-    // Arrange
-    ck::gemm_util::GemmParams params;
-    params.M       = 1024;
-    params.N       = 1024;
-    params.K       = 1024;
-    params.StrideA = 1024;
-    params.StrideB = 1024;
-    params.StrideC = 1024;
-
-    auto host_tensors                = PrepareGemmTensor(params);
-    const Tensor<ADataType>& a_bf16  = std::get<0>(host_tensors);
-    const Tensor<BDataType>& b_bf16  = std::get<1>(host_tensors);
-    Tensor<CDataType>& c_device_bf16 = std::get<2>(host_tensors);
-    Tensor<float>& a_fp32            = std::get<3>(host_tensors);
-    Tensor<float>& b_fp32            = std::get<4>(host_tensors);
-    Tensor<float>& c_host_fp32       = std::get<5>(host_tensors);
-    Tensor<float>& c_device_fp32     = std::get<6>(host_tensors);
+    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
+    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
 
-    auto a_element_op = PassThrough{};
-    auto b_element_op = PassThrough{};
-    auto c_element_op = PassThrough{};
-
-    // use fp32 host kernel to verify bf16 device kernel
-    using ReferenceGemmInstance = ck::tensor_operation::host::
-        ReferenceGemm<float, float, float, PassThrough, PassThrough, PassThrough>;
-    ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
-        a_fp32, b_fp32, c_host_fp32, a_element_op, b_element_op, c_element_op);
+    bool res = true;
+    std::vector<DeviceGemmNoOpPtr> gemmPtrs;
 
-    // Act
-    ck::gemm_util::RunDeviceGEMM(
-        gemmPtr, params, a_bf16, b_bf16, c_device_bf16, a_element_op, b_element_op, c_element_op);
-
-    bf16_to_f32_(c_device_bf16, c_device_fp32);
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemmBF16<DeviceGemmNoOpPtr, ColumnMajor, RowMajor, RowMajor,
+                                           PassThrough, PassThrough, PassThrough>{}(gemmPtr);
+    }
 
-    // Assert
-    bool res = test_util::check_err(
-        c_device_fp32.mData, c_host_fp32.mData, "Error: incorrect results!", 1e-2f, 1e-3f);
-
-    std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    return res;
-}
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemmBF16<DeviceGemmNoOpPtr, ColumnMajor, ColumnMajor, RowMajor,
+                                           PassThrough, PassThrough, PassThrough>{}(gemmPtr);
+    }
 
-} // anonymous namespace
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemmBF16<DeviceGemmNoOpPtr, RowMajor, RowMajor, RowMajor,
+                                           PassThrough, PassThrough, PassThrough>{}(gemmPtr);
+    }
 
-int main()
-{
-    std::vector<DeviceGemmPtr_> gemmPtrs;
-    ck::tensor_operation::device::device_gemm_instance::
-        add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(gemmPtrs);
-
-    bool res = true;
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(gemmPtrs);
     for(auto& gemmPtr : gemmPtrs)
     {
-        res &= TestGemm(gemmPtr);
+        res &= ck::gemm_util::TestGemmBF16<DeviceGemmNoOpPtr, RowMajor, ColumnMajor, RowMajor,
+                                           PassThrough, PassThrough, PassThrough>{}(gemmPtr);
     }
 
     std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
     return res ? 0 : 1;
 }
test/gemm/gemm_fp16.cpp (new file, mode 100644)

#include <algorithm>
#include <cstdlib>
#include <half.hpp>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>

#include "gemm_util.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_xdl.hpp"
#include "device_gemm_xdl_c_shuffle.hpp"
#include "element_wise_operation.hpp"
#include "gemm_specialization.hpp"

using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceGemmNoOpPtr =
    ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
                                                ck::tensor_operation::element_wise::PassThrough,
                                                ck::tensor_operation::element_wise::PassThrough>;

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_gemm_instance {

void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);

} // namespace device_gemm_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

int main()
{
    using ADataType = ck::half_t;
    using BDataType = ck::half_t;
    using CDataType = ck::half_t;

    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;

    bool res = true;
    std::vector<DeviceGemmNoOpPtr> gemmPtrs;

    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(gemmPtrs);
    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(gemmPtrs);
    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(gemmPtrs);
    for(auto& gemmPtr : gemmPtrs)
    {
        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, ADataType, BDataType, CDataType,
                                       ColumnMajor, RowMajor, RowMajor,
                                       PassThrough, PassThrough, PassThrough>{}(gemmPtr);
    }

    gemmPtrs.clear();
    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(gemmPtrs);
    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(gemmPtrs);
    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(gemmPtrs);
    for(auto& gemmPtr : gemmPtrs)
    {
        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, ADataType, BDataType, CDataType,
                                       ColumnMajor, ColumnMajor, RowMajor,
                                       PassThrough, PassThrough, PassThrough>{}(gemmPtr);
    }

    gemmPtrs.clear();
    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(gemmPtrs);
    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(gemmPtrs);
    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(gemmPtrs);
    for(auto& gemmPtr : gemmPtrs)
    {
        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, ADataType, BDataType, CDataType,
                                       RowMajor, RowMajor, RowMajor,
                                       PassThrough, PassThrough, PassThrough>{}(gemmPtr);
    }

    gemmPtrs.clear();
    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(gemmPtrs);
    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(gemmPtrs);
    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(gemmPtrs);
    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(gemmPtrs);
    for(auto& gemmPtr : gemmPtrs)
    {
        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, ADataType, BDataType, CDataType,
                                       RowMajor, ColumnMajor, RowMajor,
                                       PassThrough, PassThrough, PassThrough>{}(gemmPtr);
    }

    std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
    return res ? 0 : 1;
}
test/gemm/gemm_fp32.cpp
@@ -19,11 +19,10 @@
 #include "element_wise_operation.hpp"
 #include "reference_gemm.hpp"
 #include "gemm_specialization.hpp"
-#include "test_util.hpp"
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using DeviceGemmPtr_ =
+using DeviceGemmNoOpPtr =
     ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
                                                 ck::tensor_operation::element_wise::PassThrough,
                                                 ck::tensor_operation::element_wise::PassThrough>;
@@ -32,107 +31,124 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace device_gemm_instance {
-void add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(std::vector<DeviceGemmPtr_>&);
-}
+void add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+} // namespace device_gemm_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
 
-namespace {
-
-using ADataType   = float;
-using BDataType   = float;
-using CDataType   = float;
-using AccDataType = float;
-
-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
-using CLayout = ck::tensor_layout::gemm::RowMajor;
-
-auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params)
-{
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
-            }
-            else
-            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
-            }
-        };
-
-    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
-    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
-    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-
-    a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5});
-    b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
-
-    return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result);
-}
-
-bool TestGemm(DeviceGemmPtr_& gemmPtr)
+int main()
 {
-    // Arrange
-    ck::gemm_util::GemmParams params;
-    params.M       = 1024;
-    params.N       = 1024;
-    params.K       = 1024;
-    params.StrideA = 1024;
-    params.StrideB = 1024;
-    params.StrideC = 1024;
-
-    auto host_tensors           = PrepareGemmTensor(params);
-    const Tensor<ADataType>& a  = std::get<0>(host_tensors);
-    const Tensor<BDataType>& b  = std::get<1>(host_tensors);
-    Tensor<CDataType>& c_host   = std::get<2>(host_tensors);
-    Tensor<CDataType>& c_device = std::get<3>(host_tensors);
-
-    auto a_element_op = PassThrough{};
-    auto b_element_op = PassThrough{};
-    auto c_element_op = PassThrough{};
-
-    using ReferenceGemmInstance = ck::tensor_operation::host::
-        ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
-    ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
-        a, b, c_host, a_element_op, b_element_op, c_element_op);
-
-    // Act
-    ck::gemm_util::RunDeviceGEMM(
-        gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);
-
-    // Assert
-    bool res = test_util::check_err(
-        c_device.mData, c_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
-
-    std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    return res;
-}
-
-} // anonymous namespace
-
-int main()
-{
-    std::vector<DeviceGemmPtr_> gemmPtrs;
-    ck::tensor_operation::device::device_gemm_instance::
-        add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(gemmPtrs);
-
-    bool res = true;
-    for(auto& gemmPtr : gemmPtrs)
-    {
-        res &= TestGemm(gemmPtr);
-    }
+    using ADataType = float;
+    using BDataType = float;
+    using CDataType = float;
+
+    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
+    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
+
+    bool res = true;
+    std::vector<DeviceGemmNoOpPtr> gemmPtrs;
+
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(gemmPtrs);
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(gemmPtrs);
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, ADataType, BDataType, CDataType,
+                                       ColumnMajor, RowMajor, RowMajor,
+                                       PassThrough, PassThrough, PassThrough>{}(gemmPtr);
+    }
+
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(gemmPtrs);
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(gemmPtrs);
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, ADataType, BDataType, CDataType,
+                                       ColumnMajor, ColumnMajor, RowMajor,
+                                       PassThrough, PassThrough, PassThrough>{}(gemmPtr);
+    }
+
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(gemmPtrs);
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(gemmPtrs);
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, ADataType, BDataType, CDataType,
+                                       RowMajor, RowMajor, RowMajor,
+                                       PassThrough, PassThrough, PassThrough>{}(gemmPtr);
+    }
+
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(gemmPtrs);
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(gemmPtrs);
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, ADataType, BDataType, CDataType,
+                                       RowMajor, ColumnMajor, RowMajor,
+                                       PassThrough, PassThrough, PassThrough>{}(gemmPtr);
+    }
 
     std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
     return res ? 0 : 1;
 }
test/gemm/gemm_int8.cpp
@@ -19,11 +19,10 @@
 #include "element_wise_operation.hpp"
 #include "reference_gemm.hpp"
 #include "gemm_specialization.hpp"
-#include "test_util.hpp"
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using DeviceGemmPtr_ =
+using DeviceGemmNoOpPtr =
     ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
                                                 ck::tensor_operation::element_wise::PassThrough,
                                                 ck::tensor_operation::element_wise::PassThrough>;
@@ -32,106 +31,102 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace device_gemm_instance {
-void add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances(std::vector<DeviceGemmPtr_>&);
-}
+void add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+} // namespace device_gemm_instance
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
 
-namespace {
-
-using ADataType   = int8_t;
-using BDataType   = int8_t;
-using CDataType   = int8_t;
-using AccDataType = int32_t;
-
-using ALayout = ck::tensor_layout::gemm::RowMajor;
-using BLayout = ck::tensor_layout::gemm::ColumnMajor;
-using CLayout = ck::tensor_layout::gemm::RowMajor;
-
-auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params)
-{
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
-            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
-            }
-            else
-            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
-            }
-        };
-
-    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
-    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
-    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
-
-    a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
-    b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
-
-    return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result);
-}
-
-bool TestGemm(DeviceGemmPtr_& gemmPtr)
+int main()
 {
-    // Arrange
-    ck::gemm_util::GemmParams params;
-    params.M       = 1024;
-    params.N       = 1024;
-    params.K       = 1024;
-    params.StrideA = 1024;
-    params.StrideB = 1024;
-    params.StrideC = 1024;
-
-    auto host_tensors           = PrepareGemmTensor(params);
-    const Tensor<ADataType>& a  = std::get<0>(host_tensors);
-    const Tensor<BDataType>& b  = std::get<1>(host_tensors);
-    Tensor<CDataType>& c_host   = std::get<2>(host_tensors);
-    Tensor<CDataType>& c_device = std::get<3>(host_tensors);
-
-    auto a_element_op = PassThrough{};
-    auto b_element_op = PassThrough{};
-    auto c_element_op = PassThrough{};
-
-    using ReferenceGemmInstance = ck::tensor_operation::host::
-        ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
-    ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
-        a, b, c_host, a_element_op, b_element_op, c_element_op);
-
-    // Act
-    ck::gemm_util::RunDeviceGEMM(
-        gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);
-
-    // Assert
-    bool res = test_util::check_err(c_device.mData, c_host.mData, "Error: incorrect results!");
-
-    std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
-    return res;
-}
-
-} // anonymous namespace
-
-int main()
-{
-    std::vector<DeviceGemmPtr_> gemmPtrs;
-    ck::tensor_operation::device::device_gemm_instance::
-        add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances(gemmPtrs);
-
-    bool res = true;
-    for(auto& gemmPtr : gemmPtrs)
-    {
-        res &= TestGemm(gemmPtr);
-    }
+    using ADataType = int8_t;
+    using BDataType = int8_t;
+    using CDataType = int8_t;
+
+    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
+    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
+
+    std::vector<DeviceGemmNoOpPtr> gemmPtrs;
+    bool res = true;
+
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, ADataType, BDataType, CDataType,
+                                       ColumnMajor, RowMajor, RowMajor,
+                                       PassThrough, PassThrough, PassThrough>{}(gemmPtr);
+    }
+
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_nk_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, ADataType, BDataType, CDataType,
+                                       ColumnMajor, ColumnMajor, RowMajor,
+                                       PassThrough, PassThrough, PassThrough>{}(gemmPtr);
+    }
+
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, ADataType, BDataType, CDataType,
+                                       RowMajor, RowMajor, RowMajor,
+                                       PassThrough, PassThrough, PassThrough>{}(gemmPtr);
+    }
+
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, ADataType, BDataType, CDataType,
+                                       RowMajor, ColumnMajor, RowMajor,
+                                       PassThrough, PassThrough, PassThrough>{}(gemmPtr);
+    }
 
     std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
     return res ? 0 : 1;
 }
test/gemm/gemm_util.hpp

#ifndef GEMM_UTILS_HPP
#define GEMM_UTILS_HPP

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "reference_gemm.hpp"
#include "tensor_layout.hpp"

namespace ck {
namespace gemm_util {
...
@@ -98,6 +102,239 @@ void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
    c_m_n_device_buf.FromDevice(C.mData.data());
}

template <typename DeviceGemmPtr_,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename ALayout,
          typename BLayout,
          typename CLayout,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
struct TestGemm
{
    auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params)
    {
        auto f_host_tensor_descriptor =
            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({stride, 1}));
                }
                else
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({1, stride}));
                }
            };

        Tensor<ADataType> a_m_k(f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
        Tensor<BDataType> b_k_n(f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
        Tensor<CDataType> c_m_n_host_result(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
        Tensor<CDataType> c_m_n_device_result(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));

        auto f_generate_tensor_value = [](auto desc, auto type) {
            using dataType = decltype(type);

            if(std::is_same<dataType, int8_t>::value)
            {
                desc.GenerateTensorValue(GeneratorTensor_2<int8_t>{-5, 5});
            }
            else
            {
                desc.GenerateTensorValue(GeneratorTensor_3<dataType>{-0.5, 0.5});
            }
        };

        f_generate_tensor_value(a_m_k, ADataType{});
        f_generate_tensor_value(b_k_n, BDataType{});

        return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result);
    }

    auto operator()(DeviceGemmPtr_& gemmPtr)
    {
        std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name
                  << ", CLayout = " << CLayout{}.name << std::endl;
        std::cout << gemmPtr->GetTypeString() << std::endl;

        // Arrange
        ck::gemm_util::GemmParams params;
        params.M       = 1024;
        params.N       = 1024;
        params.K       = 1024;
        params.StrideA = 1024;
        params.StrideB = 1024;
        params.StrideC = 1024;

        auto host_tensors = PrepareGemmTensor(params);

        const Tensor<ADataType>& a  = std::get<0>(host_tensors);
        const Tensor<BDataType>& b  = std::get<1>(host_tensors);
        Tensor<CDataType>& c_host   = std::get<2>(host_tensors);
        Tensor<CDataType>& c_device = std::get<3>(host_tensors);

        auto a_element_op = AElementwiseOperation{};
        auto b_element_op = BElementwiseOperation{};
        auto c_element_op = CElementwiseOperation{};

        using ReferenceGemmInstance =
            ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                      BDataType,
                                                      CDataType,
                                                      AElementwiseOperation,
                                                      BElementwiseOperation,
                                                      CElementwiseOperation>;
        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
            a, b, c_host, a_element_op, b_element_op, c_element_op);

        // Act
        ck::gemm_util::RunDeviceGEMM(
            gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);

        // Assert
        bool res = false;
        if(std::is_same<CDataType, float>::value)
        {
            res = ck::utils::check_err(c_device.mData, c_host.mData);
            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
        }
        else if(std::is_same<CDataType, ck::half_t>::value)
        {
            res = ck::utils::check_err(c_device.mData, c_host.mData);
            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
        }
        else if(std::is_same<CDataType, int8_t>::value)
        {
            res = ck::utils::check_err(c_device.mData, c_host.mData);
            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
        }

        return res;
    }
};

template <typename DeviceGemmPtr_,
          typename ALayout,
          typename BLayout,
          typename CLayout,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
struct TestGemmBF16
{
    using BF16 = ck::bhalf_t;

    auto PrepareGemmTensorBF16(const ck::gemm_util::GemmParams& params)
    {
        auto f_host_tensor_descriptor =
            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({stride, 1}));
                }
                else
                {
                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                                std::vector<std::size_t>({1, stride}));
                }
            };

        // use fp32 host kernel to verify bf16 device kernel
        Tensor<BF16> a_m_k_bf16(f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
        Tensor<BF16> b_k_n_bf16(f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
        Tensor<BF16> c_m_n_device_bf16(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
        Tensor<float> a_m_k_fp32(f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
        Tensor<float> b_k_n_fp32(f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
        Tensor<float> c_m_n_host_fp32(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
        Tensor<float> c_m_n_device_fp32(
            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));

        a_m_k_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});
        b_k_n_bf16.GenerateTensorValue(GeneratorTensor_3<BF16>{-0.5, 0.5});

        bf16_to_f32_(a_m_k_bf16, a_m_k_fp32);
        bf16_to_f32_(b_k_n_bf16, b_k_n_fp32);

        return std::make_tuple(a_m_k_bf16,
                               b_k_n_bf16,
                               c_m_n_device_bf16,
                               a_m_k_fp32,
                               b_k_n_fp32,
                               c_m_n_host_fp32,
                               c_m_n_device_fp32);
    }

    auto operator()(DeviceGemmPtr_& gemmPtr)
    {
        // Arrange
        ck::gemm_util::GemmParams params;
        params.M       = 1024;
        params.N       = 1024;
        params.K       = 1024;
        params.StrideA = 1024;
        params.StrideB = 1024;
        params.StrideC = 1024;

        auto host_tensors = PrepareGemmTensorBF16(params);

        const Tensor<BF16>& a_bf16   = std::get<0>(host_tensors);
        const Tensor<BF16>& b_bf16   = std::get<1>(host_tensors);
        Tensor<BF16>& c_device_bf16  = std::get<2>(host_tensors);
        Tensor<float>& a_fp32        = std::get<3>(host_tensors);
        Tensor<float>& b_fp32        = std::get<4>(host_tensors);
        Tensor<float>& c_host_fp32   = std::get<5>(host_tensors);
        Tensor<float>& c_device_fp32 = std::get<6>(host_tensors);

        auto a_element_op = AElementwiseOperation{};
        auto b_element_op = BElementwiseOperation{};
        auto c_element_op = CElementwiseOperation{};

        // use fp32 host kernel to verify bf16 device kernel
        using ReferenceGemmInstance =
            ck::tensor_operation::host::ReferenceGemm<float,
                                                      float,
                                                      float,
                                                      AElementwiseOperation,
                                                      BElementwiseOperation,
                                                      CElementwiseOperation>;
        ck::gemm_util::RunHostGEMM<ReferenceGemmInstance>(
            a_fp32, b_fp32, c_host_fp32, a_element_op, b_element_op, c_element_op);

        // Act
        ck::gemm_util::RunDeviceGEMM(
            gemmPtr, params, a_bf16, b_bf16, c_device_bf16, a_element_op, b_element_op, c_element_op);

        bf16_to_f32_(c_device_bf16, c_device_fp32);

        // Assert
        bool res = ck::utils::check_err(
            c_device_fp32.mData, c_host_fp32.mData, "Error: incorrect results!", 1e-2f, 1e-3f);
        std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;

        return res;
    };
};

} // namespace gemm_util
} // namespace ck
#endif
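A compact sketch of how the data-type tests in this commit drive the TestGemm helper above. The DeviceGemmNoOpPtr and PassThrough aliases are the ones declared in the test sources earlier, and the single instance factory used here is just one of the declared add_* functions, chosen for illustration; the function name itself is hypothetical.

// Minimal sketch mirroring the fp16/fp32/int8 tests above; test_one_layout_combination is
// a hypothetical name, not part of this commit.
bool test_one_layout_combination()
{
    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;

    std::vector<DeviceGemmNoOpPtr> gemmPtrs;
    ck::tensor_operation::device::device_gemm_instance::
        add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(gemmPtrs);

    bool res = true;
    for(auto& gemmPtr : gemmPtrs)
    {
        // A row-major (M x K), B column-major (K x N stored N-contiguous in K), C row-major.
        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, float, float, float,
                                       RowMajor, ColumnMajor, RowMajor,
                                       PassThrough, PassThrough, PassThrough>{}(gemmPtr);
    }
    return res;
}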
test/gemm_reduce/CMakeLists.txt (new file, mode 100644)

include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/profiler/include
    ${PROJECT_SOURCE_DIR}/test/include
    ${PROJECT_SOURCE_DIR}/external/include/half
)

add_test_executable(test_gemm_reduce_fp16 gemm_reduce_fp16.cpp)
target_link_libraries(test_gemm_reduce_fp16 PRIVATE host_tensor)
target_link_libraries(test_gemm_reduce_fp16 PRIVATE device_gemm_reduce_instance)
test/gemm_reduce/gemm_reduce_fp16.cpp (new file, mode 100644)

#include <iostream>
#include "profile_gemm_reduce_impl.hpp"

int main()
{
    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;

    int M = 512;
    int N = 256;
    int K = 128;

    bool pass = true;

    pass = pass &&
           ck::profiler::profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Row, Row, Row>(
               true, 1, false, 1, M, N, K, K, N, N);
    pass = pass &&
           ck::profiler::profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Row, Col, Row>(
               true, 1, false, 1, M, N, K, K, K, N);
    pass = pass &&
           ck::profiler::profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Col, Row, Row>(
               true, 1, false, 1, M, N, K, M, N, N);
    pass = pass &&
           ck::profiler::profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float, Col, Col, Row>(
               true, 1, false, 1, M, N, K, M, K, N);

    if(pass)
    {
        std::cout << "test GEMM+Reduce fp16: Pass" << std::endl;
        return 0;
    }
    else
    {
        std::cout << "test GEMM+Reduce fp16: Fail" << std::endl;
        return -1;
    }
}
test/gemm_split_k/gemm_split_k.cpp
@@ -12,7 +12,7 @@
 #include "tensor_layout.hpp"
 #include "device_gemm_xdl_splitk.hpp"
 
-enum GemmMatrixLayout
+enum struct GemmMatrixLayout
 {
     MK_KN_MN, // 0
     MK_NK_MN, // 1
@@ -59,7 +59,7 @@ static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
 
 struct gemmArgs
 {
-    int layout;
+    GemmMatrixLayout layout;
     int M;
     int N;
     int K;
@@ -120,7 +120,7 @@ int test_gemm(const gemmArgs& args)
         f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));
 
     // init data
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
     a_m_k.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
     b_k_n.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
 
     // set zero to c_device_buf
@@ -216,13 +216,13 @@ int main(int argc, char* argv[])
     std::vector<gemmArgs> test_cases;
     if(argc == 1)
     {
-        test_cases = {{0, 3, 3, 3, 3, 3, 3, 1}};
+        test_cases = {{GemmMatrixLayout::MK_KN_MN, 3, 3, 3, 3, 3, 3, 1}}; // JD: Populate with more and meaningful
         return 0;
     }
     else if(argc == 9)
     {
-        const int layout  = static_cast<GemmMatrixLayout>(std::stoi(argv[1]));
+        const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[1]));
         const int M       = std::stoi(argv[2]);
         const int N       = std::stoi(argv[3]);