gaoqiong / composable_kernel · Commits · b78c8719
Commit b78c8719, authored Jul 17, 2022 by Jing Zhang
Merge remote-tracking branch 'origin/develop' into grouped_gemm_multiD
Parents: 2113ce2e, a11680cc
Showing 6 changed files with 273 additions and 26 deletions (+273 −26).
script/run_performance_tests.sh  (+28 −26)
test/CMakeLists.txt  (+1 −0)
test/layernorm/CMakeLists.txt  (+8 −0)
test/layernorm/test_layernorm_fp16.cpp  (+29 −0)
test/layernorm/test_layernorm_fp32.cpp  (+29 −0)
test/layernorm/test_layernorm_util.hpp  (+178 −0)
script/run_performance_tests.sh

@@ -10,17 +10,27 @@ pip3 install sqlalchemy pymysql pandas sshtunnel
 # post your new test results to the database and compare them to the baseline
 # please contact Illia.Silin@amd.com for more details
 #
+# run the script as "./run_performance_tests.sh <tag for your test environment>
+#get the test environment type:
+export env_type=$1
+echo 'Environment type ' $env_type
+
+function print_log_header(){
+	rm -f $1;
+	git status | grep -e 'On branch' > $1;
+	echo -n 'Node name: ' >> $1; hostname >> $1;
+	#get GPU_arch and number of compute units from rocminfo
+	echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
+	rocminfo | grep "Compute Unit:" >> $1;
+	hipcc --version | grep -e 'HIP version' >> $1;
+	echo 'Environment type: ' $2 >> $1;
+	/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
+}
+
 #run gemm tests
 export gemm_log="perf_gemm.log"
-git status | grep -e 'On branch' > ${gemm_log}
-echo -n 'Node name: ' >> ${gemm_log}; hostname >> ${gemm_log}
-#get GPU_arch and number of compute units from rocminfo
-echo -n "GPU_arch: " >> ${gemm_log}; rocminfo | grep "Name:" | grep "gfx" >> ${gemm_log}
-rocminfo | grep "Compute Unit:" >> ${gemm_log}
-hipcc --version | grep -e 'HIP version' >> ${gemm_log}
-/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}
-./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}
+rm -f $gemm_log
+print_log_header $gemm_log $env_type
+./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a $gemm_log
 ./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log
 ./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log
 ./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log

@@ -36,22 +46,14 @@ hipcc --version | grep -e 'HIP version' >> ${gemm_log}
 ./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log
 ./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log
 ./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log
-python3 parse_perf_data.py ${gemm_log}
+python3 process_perf_data.py $gemm_log
 #run resnet50 test
-export resnet_log="perf_resnet50.log"
-git status | grep -e 'On branch' > ${resnet_log}
-echo -n 'Node name: ' >> ${resnet_log}; hostname >> ${resnet_log}
-#get GPU_arch and number of compute units from rocminfo
-echo -n "GPU_arch: " >> ${resnet_log}; rocminfo | grep "Name:" | grep "gfx" >> ${resnet_log}
-rocminfo | grep "Compute Unit:" >> ${resnet_log}
-hipcc --version | grep -e 'HIP version' >> ${resnet_log}
-/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log}
-#first run tests with N=256
-./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet_log}
-#then run with N=4
-./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet_log}
-#the script will put the results from N=256 and N=4 runs into separate tables
-python3 parse_perf_data.py ${resnet_log}
+export resnet256_log="perf_resnet50_N256.log"
+rm -f $resnet_log
+print_log_header $resnet256_log $env_type
+./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a $resnet256_log
+python3 process_perf_data.py $resnet256_log
+export resnet4_log="perf_resnet50_N4.log"
+print_log_header $resnet4_log $env_type
+./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a $resnet4_log
+python3 process_perf_data.py $resnet4_log
test/CMakeLists.txt

@@ -48,3 +48,4 @@ add_subdirectory(convnd_bwd_weight)
 add_subdirectory(convnd_bwd_data)
 add_subdirectory(block_to_ctile_map)
 add_subdirectory(softmax)
+add_subdirectory(layernorm)
test/layernorm/CMakeLists.txt (new file, mode 100644)

add_custom_target(test_layernorm)
add_gtest_executable(test_layernorm_fp32 test_layernorm_fp32.cpp)
add_gtest_executable(test_layernorm_fp16 test_layernorm_fp16.cpp)
target_link_libraries(test_layernorm_fp32 PRIVATE host_tensor)
target_link_libraries(test_layernorm_fp16 PRIVATE host_tensor)
add_dependencies(test_layernorm test_layernorm_fp32)
add_dependencies(test_layernorm test_layernorm_fp16)
(no newline at end of file)
test/layernorm/test_layernorm_fp16.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "gtest/gtest.h"
#include "test_layernorm_util.hpp"

template <ck::index_t N>
using I = ck::Number<N>;

template <typename Tuple>
class TestLayernormFP16 : public ck::TestLayernorm<Tuple>
{
};

// clang-format off
using KernelTypes = ::testing::Types<
    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize
    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>,  I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>,  I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>,  I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>,  I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>
    >;
// clang-format on

TYPED_TEST_SUITE(TestLayernormFP16, KernelTypes);

TYPED_TEST(TestLayernormFP16, Test_FP16) { this->Run(); }
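Each row of KernelTypes packs the five data types followed by twelve integral kernel parameters, in the order listed in the comment above. The following is a minimal, self-contained sketch (not part of the commit) of how such a row is decoded; Number and half_t below are simplified stand-ins for ck::Number and ck::half_t, not the library types.

#include <cstdio>
#include <tuple>

template <int N>
struct Number { static constexpr int value = N; };

using half_t = unsigned short; // stand-in for ck::half_t (storage only)

// First KernelTypes row from test_layernorm_fp16.cpp.
using Row = std::tuple<half_t, half_t, half_t, float, half_t,
                       Number<2>, Number<1>, Number<256>, Number<8>, Number<32>,
                       Number<1>, Number<8>, Number<1>, Number<8>, Number<8>,
                       Number<8>, Number<8>>;

// Elements 0-4 are data types; elements 5 onward are integral parameters,
// read exactly as TestLayernorm does: std::tuple_element_t<i, Tuple>{}.value.
static constexpr int Rank               = std::tuple_element_t<5, Row>{}.value;
static constexpr int NumReduceDim       = std::tuple_element_t<6, Row>{}.value;
static constexpr int BlockSize          = std::tuple_element_t<7, Row>{}.value;
static constexpr int MThreadClusterSize = std::tuple_element_t<8, Row>{}.value;
static constexpr int KThreadClusterSize = std::tuple_element_t<9, Row>{}.value;

// In every row of this file, BlockSize equals MThreadClusterSize *
// KThreadClusterSize (e.g. 8 * 32 == 256).
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
              "cluster sizes must tile the block");

int main()
{
    std::printf("Rank=%d NumReduceDim=%d BlockSize=%d M=%d K=%d\n",
                Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize);
}

Note that all eight rows keep BlockSize at 256 and vary only how those 256 threads are split between the M and K cluster dimensions, plus the per-thread M slice size (1 or 2).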
test/layernorm/test_layernorm_fp32.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "gtest/gtest.h"
#include "test_layernorm_util.hpp"

template <ck::index_t N>
using I = ck::Number<N>;

template <typename Tuple>
class TestLayernormFP32 : public ck::TestLayernorm<Tuple>
{
};

// clang-format off
using KernelTypes = ::testing::Types<
    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize
    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<8>, I<32>,  I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<8>, I<32>,  I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<4>, I<64>,  I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<4>, I<64>,  I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>
    >;
// clang-format on

TYPED_TEST_SUITE(TestLayernormFP32, KernelTypes);

TYPED_TEST(TestLayernormFP32, Test_FP32) { this->Run(); }
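Both test files rely on GoogleTest's typed-test machinery: the fixture is instantiated once per tuple in KernelTypes, so a single TYPED_TEST body covers every kernel configuration. Below is a self-contained sketch of the pattern in isolation; the fixture and type list are illustrative, not from the commit.

#include <gtest/gtest.h>
#include <tuple>

template <typename Tuple>
class TupleSizeTest : public ::testing::Test
{
};

// Two illustrative "configuration rows", analogous to KernelTypes.
using Rows = ::testing::Types<std::tuple<int>, std::tuple<int, float>>;
TYPED_TEST_SUITE(TupleSizeTest, Rows);

// Instantiated once per entry in Rows, just as Test_FP16/Test_FP32 run once
// per configuration in KernelTypes.
TYPED_TEST(TupleSizeTest, HasElements)
{
    EXPECT_GE(std::tuple_size<TypeParam>::value, 1u);
}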
test/layernorm/test_layernorm_util.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>
#include <iostream>
#include <gtest/gtest.h>

#include "ck/ck.hpp"
#include "ck/utility/number.hpp"
#include "ck/tensor_operation/gpu/device/device_layernorm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"

namespace ck {

template <typename Range>
std::string serialize_range(const Range& range)
{
    std::stringstream ss;
    for(auto& r : range)
    {
        ss << r << ", ";
    }
    std::string str = ss.str();
    return std::string(str.begin(), str.end() - 2);
}

template <typename Tuple>
class TestLayernorm : public ::testing::Test
{
    protected:
    using XDataType     = std::tuple_element_t<0, Tuple>;
    using GammaDataType = std::tuple_element_t<1, Tuple>;
    using BetaDataType  = std::tuple_element_t<2, Tuple>;
    using AccDataType   = std::tuple_element_t<3, Tuple>;
    using YDataType     = std::tuple_element_t<4, Tuple>;
    static constexpr index_t Rank               = std::tuple_element_t<5, Tuple>{}.value;
    static constexpr index_t NumReduceDim       = std::tuple_element_t<6, Tuple>{}.value;
    static constexpr index_t BlockSize          = std::tuple_element_t<7, Tuple>{}.value;
    static constexpr index_t MThreadClusterSize = std::tuple_element_t<8, Tuple>{}.value;
    static constexpr index_t KThreadClusterSize = std::tuple_element_t<9, Tuple>{}.value;
    static constexpr index_t MThreadSliceSize   = std::tuple_element_t<10, Tuple>{}.value;
    static constexpr index_t KThreadSliceSize   = std::tuple_element_t<11, Tuple>{}.value;
    static constexpr index_t XYSrcVectorDim     = std::tuple_element_t<12, Tuple>{}.value;
    static constexpr index_t XSrcVectorSize     = std::tuple_element_t<13, Tuple>{}.value;
    static constexpr index_t GammaSrcVectorSize = std::tuple_element_t<14, Tuple>{}.value;
    static constexpr index_t BetaSrcVectorSize  = std::tuple_element_t<15, Tuple>{}.value;
    static constexpr index_t YDstVectorSize     = std::tuple_element_t<16, Tuple>{}.value;

    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    using ReferenceInstance = tensor_operation::host::ReferenceLayernorm<XDataType,
                                                                         GammaDataType,
                                                                         BetaDataType,
                                                                         YDataType,
                                                                         AccDataType,
                                                                         PassThrough,
                                                                         Rank,
                                                                         NumReduceDim>;

    using DeviceInstance = tensor_operation::device::DeviceLayernorm<XDataType,
                                                                     GammaDataType,
                                                                     BetaDataType,
                                                                     AccDataType,
                                                                     YDataType,
                                                                     PassThrough,
                                                                     Rank,
                                                                     NumReduceDim,
                                                                     BlockSize,
                                                                     MThreadClusterSize,
                                                                     KThreadClusterSize,
                                                                     MThreadSliceSize,
                                                                     KThreadSliceSize,
                                                                     XYSrcVectorDim,
                                                                     XSrcVectorSize,
                                                                     GammaSrcVectorSize,
                                                                     BetaSrcVectorSize,
                                                                     YDstVectorSize>;

    TestLayernorm() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {}

    void RunSingle(std::vector<index_t> lengths, std::vector<index_t> reduceDims)
    {
        std::vector<index_t> reduceLength(reduceDims.size());
        for(int i = 0; i < NumReduceDim; ++i)
        {
            reduceLength[i] = lengths[reduceDims[i]];
        }

        Tensor<XDataType> x(lengths);
        Tensor<GammaDataType> gamma(reduceLength);
        Tensor<BetaDataType> beta(reduceLength);
        Tensor<YDataType> y(lengths);
        Tensor<YDataType> y_ref(lengths);

        x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0, 1.0});
        gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
        beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{0.0, 1.0});

        DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpace());
        DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpace());
        DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpace());
        DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpace());

        x_dev.ToDevice(x.mData.data());
        gamma_dev.ToDevice(gamma.mData.data());
        beta_dev.ToDevice(beta.mData.data());

        auto device_instance = DeviceInstance{};
        auto argument_ptr    = device_instance.MakeArgumentPointer(
            lengths,
            std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
            std::vector<ck::index_t>{gamma.mDesc.GetStrides().begin(),
                                     gamma.mDesc.GetStrides().end()},
            std::vector<ck::index_t>{beta.mDesc.GetStrides().begin(),
                                     beta.mDesc.GetStrides().end()},
            reduceDims,
            1e-4,
            x_dev.GetDeviceBuffer(),
            gamma_dev.GetDeviceBuffer(),
            beta_dev.GetDeviceBuffer(),
            y_dev.GetDeviceBuffer(),
            PassThrough{});

        if(!device_instance.IsSupportedArgument(argument_ptr.get()))
        {
            return;
        }

        auto invoker_ptr = device_instance.MakeInvokerPointer();
        invoker_ptr->Run(argument_ptr.get());

        ref_instance_invoker_.Run(
            {x, gamma, beta, y_ref, PassThrough{}, lengths, reduceDims, 1e-4});

        y_dev.FromDevice(y.mData.data());

        bool pass;
        if(std::is_same<XDataType, int8_t>::value)
        {
            EXPECT_TRUE(pass = ck::utils::check_err(
                            y.mData, y_ref.mData, "Error: Incorrect results!", 0, 1));
        }
        else
        {
            EXPECT_TRUE(pass = ck::utils::check_err(
                            y.mData, y_ref.mData, "Error: Incorrect results d1", 1e-3, 1e-3));
        }

        if(!pass)
        {
            FAIL() << "Failure in input lengths = [" << serialize_range(lengths) << "], "
                   << "reduce dim = [" << serialize_range(reduceDims) << "].";
        }
    }

    void Run()
    {
        for(auto length : this->lengths_)
        {
            this->RunSingle(length, reduceDims_[0]);
        }
    }

    std::vector<std::vector<index_t>> lengths_ = {
        {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
    std::vector<std::vector<index_t>> reduceDims_ = {{1}};

    typename ReferenceInstance::Invoker ref_instance_invoker_;
};

} // namespace ck
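RunSingle compares the DeviceLayernorm output against ReferenceLayernorm, the CPU oracle included from ck/library/reference_tensor_operation. For the Rank = 2, reduceDims = {1} shapes driven by lengths_, the expected math is ordinary row-wise layer normalization with per-column gamma and beta and the 1e-4 epsilon passed above. The following is a minimal plain-CPU sketch of that assumed behavior, not the committed reference code.

#include <cmath>
#include <cstdio>
#include <vector>

// Layernorm over an [M, N] tensor reduced along dim 1, matching the test's
// lengths_/reduceDims_ setup: per-row mean/variance, per-column gamma/beta.
void layernorm_2d(const std::vector<float>& x, const std::vector<float>& gamma,
                  const std::vector<float>& beta, std::vector<float>& y,
                  int M, int N, float eps = 1e-4f)
{
    for(int m = 0; m < M; ++m)
    {
        // mean and variance over the reduced (N) dimension
        float mean = 0.f, var = 0.f;
        for(int n = 0; n < N; ++n)
            mean += x[m * N + n];
        mean /= N;
        for(int n = 0; n < N; ++n)
        {
            const float d = x[m * N + n] - mean;
            var += d * d;
        }
        var /= N;
        // normalize, then scale and shift with per-column gamma and beta
        const float inv_std = 1.f / std::sqrt(var + eps);
        for(int n = 0; n < N; ++n)
            y[m * N + n] = (x[m * N + n] - mean) * inv_std * gamma[n] + beta[n];
    }
}

int main()
{
    const int M = 2, N = 4;
    std::vector<float> x = {1, 2, 3, 4, 2, 2, 2, 2};
    std::vector<float> gamma(N, 1.f), beta(N, 0.f);
    std::vector<float> y(M * N);
    layernorm_2d(x, gamma, beta, y, M, N);
    std::printf("y[0..3] = %f %f %f %f\n", y[0], y[1], y[2], y[3]);
}

The test's tolerance choices mirror this structure: exact comparison (0, 1) for int8 data, and 1e-3 relative/absolute error otherwise, since fp16/fp32 accumulation order differs between device and host.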