GitLab · gaoqiong / composable_kernel · Commits

Commit e72c0c43, authored Mar 26, 2022 by carlushuang

    Merge remote-tracking branch 'origin/develop' into cpu_avx2

Parents: d714fa15, 313bbea5
Changes: 262 · Showing 20 changed files with 1015 additions and 398 deletions (+1015 / -398)
Changed files shown in this view:

    profiler/src/profile_gemm_bias_relu_add.cpp     +4    -4
    profiler/src/profile_gemm_reduce.cpp            +147  -0
    profiler/src/profile_grouped_gemm.cpp           +157  -0
    profiler/src/profile_reduce.cpp                 +90   -12
    profiler/src/profiler.cpp                       +14   -2
    script/cmake-rocm.sh                            +2    -2
    script/profile_reduce_no_index.sh               +12   -3
    script/profile_reduce_with_index.sh             +3    -0
    script/test_convnd_fwd.sh                       +110  -0
    script/test_reduce_no_index.sh                  +52   -0
    script/test_reduce_with_index.sh                +52   -0
    test/CMakeLists.txt                             +5    -1
    test/batched_gemm/CMakeLists.txt                +4    -0
    test/batched_gemm/batched_gemm_fp16.cpp         +139  -0
    test/batched_gemm/batched_gemm_util.hpp         +106  -0
    test/conv2d_bwd_data/conv2d_bwd_data.cpp        +5    -5
    test/conv2d_fwd/CMakeLists.txt                  +0    -3
    test/conv2d_fwd/conv2d_fwd.cpp                  +0    -308
    test/conv_util/conv_util.cpp                    +96   -56
    test/convnd_fwd/CMakeLists.txt                  +17   -2
profiler/src/profile_gemm_bias_relu_add.cpp

@@ -6,7 +6,7 @@
 #include <half.hpp>
 #include "profile_gemm_bias_relu_add_impl.hpp"

-enum GemmMatrixLayout
+enum struct GemmMatrixLayout
 {
     MK_KN_MN, // 0
     MK_NK_MN, // 1
@@ -18,7 +18,7 @@ enum GemmMatrixLayout
     KM_NK_NM, // 7
 };

-enum GemmDataType
+enum struct GemmDataType
 {
     F32_F32_F32, // 0
     F16_F16_F16, // 1
@@ -43,8 +43,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
         exit(1);
     }

-    const int data_type        = static_cast<GemmDataType>(std::stoi(argv[2]));
-    const int layout           = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
     const bool do_verification = std::stoi(argv[4]);
     const int init_method      = std::stoi(argv[5]);
     const bool do_log          = std::stoi(argv[6]);
profiler/src/profile_gemm_reduce.cpp  (new file, 0 → 100644)

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_gemm_reduce_impl.hpp"

int profile_gemm_reduce(int argc, char* argv[])
{
    enum struct GemmMatrixLayout_t
    {
        MK_KN_MN, // 0
        MK_NK_MN, // 1
        KM_KN_MN, // 2
        KM_NK_MN, // 3
    };

    enum struct GemmReduceDataType_t
    {
        F32_F32_F32_F32_F32, // 0
        F16_F16_F16_F32_F32, // 1
    };

    if(!(argc == 14 || argc == 15))
    {
        printf("arg1: tensor operation (gemm: GEMM+Reduce)\n");
        printf("arg2: data type (0: fp32; 1: fp16)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg8: print tensor value (0: no; 1: yes)\n");
        printf("arg7: run kernel # of times (>1)\n");
        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
        printf("arg14: split k into mulitiple batch\n");
        exit(1);
    }

    const auto data_type       = static_cast<GemmReduceDataType_t>(std::stoi(argv[2]));
    const auto layout          = static_cast<GemmMatrixLayout_t>(std::stoi(argv[3]));
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
    const int nrepeat          = std::stoi(argv[7]);

    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
    const int K = std::stoi(argv[10]);

    const int StrideA = std::stoi(argv[11]);
    const int StrideB = std::stoi(argv[12]);
    const int StrideC = std::stoi(argv[13]);

    if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
       layout == GemmMatrixLayout_t::MK_KN_MN)
    {
        ck::profiler::profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float,
                                               ck::tensor_layout::gemm::RowMajor,
                                               ck::tensor_layout::gemm::RowMajor,
                                               ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, M, N, K,
            (StrideA < 0) ? K : StrideA, (StrideB < 0) ? N : StrideB, (StrideC < 0) ? N : StrideC);
    }
    else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
            layout == GemmMatrixLayout_t::MK_NK_MN)
    {
        ck::profiler::profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float,
                                               ck::tensor_layout::gemm::RowMajor,
                                               ck::tensor_layout::gemm::ColumnMajor,
                                               ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, M, N, K,
            (StrideA < 0) ? K : StrideA, (StrideB < 0) ? K : StrideB, (StrideC < 0) ? N : StrideC);
    }
    else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
            layout == GemmMatrixLayout_t::KM_KN_MN)
    {
        ck::profiler::profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float,
                                               ck::tensor_layout::gemm::ColumnMajor,
                                               ck::tensor_layout::gemm::RowMajor,
                                               ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, M, N, K,
            (StrideA < 0) ? M : StrideA, (StrideB < 0) ? N : StrideB, (StrideC < 0) ? N : StrideC);
    }
    else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
            layout == GemmMatrixLayout_t::KM_NK_MN)
    {
        ck::profiler::profile_gemm_reduce_impl<ck::half_t, ck::half_t, ck::half_t, float,
                                               ck::tensor_layout::gemm::ColumnMajor,
                                               ck::tensor_layout::gemm::ColumnMajor,
                                               ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, M, N, K,
            (StrideA < 0) ? M : StrideA, (StrideB < 0) ? K : StrideB, (StrideC < 0) ? N : StrideC);
    }
    else
    {
        throw std::runtime_error("wrong! this data_type & layout is not implemented");
    }

    return 1;
}
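Going by the usage text that the new profile_gemm_reduce prints, a run takes the operation name plus twelve further arguments (thirteen with the optional split-k batch count). A purely illustrative invocation, assuming the profiler is built as bin/ckProfiler as in the scripts later in this commit, might look like:

    # op  dtype  layout  verify  init  log  repeats  M  N  K  StrideA  StrideB  StrideC
    ./bin/ckProfiler gemm_reduce 1 1 1 1 0 5 3840 4096 4096 4096 4096 4096

Here dtype 1 selects fp16 (the only combination wired up above) and layout 1 is A[m, k] * B[n, k] = C[m, n]; the problem size is made up for illustration, not taken from the commit.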
profiler/src/profile_grouped_gemm.cpp  (new file, 0 → 100644)

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_grouped_gemm_impl.hpp"

enum GemmMatrixLayout
{
    MK_KN_MN, // 0
    MK_NK_MN, // 1
    KM_KN_MN, // 2
    KM_NK_MN, // 3
    MK_KN_NM, // 4
    MK_NK_NM, // 5
    KM_KN_NM, // 6
    KM_NK_NM, // 7
};

enum GemmDataType
{
    F32_F32_F32,    // 0
    F16_F16_F16,    // 1
    BF16_BF16_BF16, // 2
    INT8_INT8_INT8, // 3
};

std::vector<int> argToIntArray(char* input)
{
    std::vector<int> out;
    std::istringstream in(input);
    std::string item;

    while(std::getline(in, item, ','))
    {
        out.push_back(std::stoi(item));
    }

    return out;
}

int profile_grouped_gemm(int argc, char* argv[])
{
    if(!(argc == 14))
    {
        printf("arg1: tensor operation (grouped_gemm: Grouped GEMM)\n");
        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg8: print tensor value (0: no; 1: yes)\n");
        printf("arg7: run kernel # of times (>1)\n");
        printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
               "64,64 64,64 128,128)\n");
        exit(1);
    }

    const int data_type        = static_cast<GemmDataType>(std::stoi(argv[2]));
    const int layout           = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
    const int nrepeat          = std::stoi(argv[7]);

    const auto Ms = argToIntArray(argv[8]);
    const auto Ns = argToIntArray(argv[9]);
    const auto Ks = argToIntArray(argv[10]);

    const auto StrideAs = argToIntArray(argv[11]);
    const auto StrideBs = argToIntArray(argv[12]);
    const auto StrideCs = argToIntArray(argv[13]);

    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::half_t, ck::half_t,
                                                ck::tensor_layout::gemm::RowMajor,
                                                ck::tensor_layout::gemm::RowMajor,
                                                ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs);
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::half_t, ck::half_t,
                                                ck::tensor_layout::gemm::RowMajor,
                                                ck::tensor_layout::gemm::ColumnMajor,
                                                ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs);
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::half_t, ck::half_t,
                                                ck::tensor_layout::gemm::ColumnMajor,
                                                ck::tensor_layout::gemm::RowMajor,
                                                ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs);
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::half_t, ck::half_t,
                                                ck::tensor_layout::gemm::ColumnMajor,
                                                ck::tensor_layout::gemm::ColumnMajor,
                                                ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs);
    }
    else
    {
        throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
    }

    return 1;
}
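The grouped GEMM profiler takes comma-separated per-group sizes, as in the example embedded in its own usage text. A sketch invocation (fp16, layout 1, two groups; only the size lists come from the usage string, the rest is illustrative):

    ./bin/ckProfiler grouped_gemm 1 1 1 1 0 5 256,256 128,128 64,64 64,64 64,64 128,128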
profiler/src/profile_reduce.cpp

@@ -34,6 +34,8 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
     {"scales", required_argument, nullptr, 'S'},
     {"half", no_argument, nullptr, '?'},
     {"double", no_argument, nullptr, '?'},
+    {"int8", no_argument, nullptr, '?'},
+    {"bf16", no_argument, nullptr, '?'},
     {"dumpout", required_argument, nullptr, 'o'},
     {"verify", required_argument, nullptr, 'v'},
     {"log", required_argument, nullptr, 'l'},
@@ -82,7 +84,7 @@ static std::vector<T> getTypeValuesFromString(const char* cstr_values)
     return (values);
 }

-typedef enum
+enum struct appDataType_t
 {
     appHalf  = 0,
     appFloat = 1,
@@ -91,7 +93,7 @@ typedef enum
     appInt8x4   = 4,
     appBFloat16 = 5,
     appDouble   = 6,
-} appDataType_t;
+};

 static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
 {
@@ -119,6 +121,8 @@ class AppArgs
     public:
     bool use_half   = false;
     bool use_double = false;
+    bool use_int8   = false;
+    bool use_bf16   = false;

     std::vector<size_t> inLengths;
     std::vector<size_t> outLengths;
@@ -127,8 +131,8 @@ class AppArgs
     std::vector<float> scales;

     ReduceTensorOp_t reduceOp = ReduceTensorOp_t::ADD;
-    appDataType_t compTypeId  = appFloat;
-    appDataType_t outTypeId   = appFloat;
+    appDataType_t compTypeId  = appDataType_t::appFloat;
+    appDataType_t outTypeId   = appDataType_t::appFloat;

     bool compType_assigned = false;
     bool outType_assigned  = false;
@@ -169,6 +173,8 @@ class AppArgs
                   << std::endl;
         std::cout << "--half, use fp16 for the input and output tensor data types" << std::endl;
         std::cout << "--double, use fp64 for the input and output tensor data types" << std::endl;
+        std::cout << "--int8, use int8 for the input and output tensor data types" << std::endl;
+        std::cout << "--bf16, use bfloat16 for the input and output tensor data types" << std::endl;
         std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
                      "comparing with the host-based reduction"
                   << std::endl;
@@ -267,6 +273,10 @@ class AppArgs
                     use_half = true;
                 else if(std::string(long_options[option_index].name) == "double")
                     use_double = true;
+                else if(std::string(long_options[option_index].name) == "int8")
+                    use_int8 = true;
+                else if(std::string(long_options[option_index].name) == "bf16")
+                    use_bf16 = true;
                 else if(std::string(long_options[option_index].name) == "help")
                 {
                     show_usage(argv[0]);
@@ -329,15 +339,16 @@ int profile_reduce(int argc, char* argv[])
     if(args.use_half)
     {
         if(!args.compType_assigned)
-            args.compTypeId = appHalf;
+            args.compTypeId = appDataType_t::appHalf;

-        if(args.outType_assigned && (args.outTypeId != appHalf && args.outTypeId != appFloat))
-            args.outTypeId = appFloat;
+        if(args.outType_assigned && (args.outTypeId != appDataType_t::appHalf &&
+                                     args.outTypeId != appDataType_t::appFloat))
+            args.outTypeId = appDataType_t::appFloat;

         if(!args.outType_assigned)
-            args.outTypeId = appHalf;
+            args.outTypeId = appDataType_t::appHalf;

-        if(args.compTypeId == appHalf)
+        if(args.compTypeId == appDataType_t::appHalf)
         {
             profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification,
                                                                     args.init_method,
@@ -352,7 +363,7 @@ int profile_reduce(int argc, char* argv[])
                                                                     args.scales[0],
                                                                     args.scales[1]);
         }
-        else if(args.compTypeId == appFloat)
+        else if(args.compTypeId == appDataType_t::appFloat)
         {
             profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
                                                                args.init_method,
@@ -385,9 +396,76 @@ int profile_reduce(int argc, char* argv[])
                                                                args.scales[0],
                                                                args.scales[1]);
         }
     }
+    else if(args.use_int8)
+    {
+        if(!args.compType_assigned)
+            args.compTypeId = appDataType_t::appInt8;
+
+        if(args.outType_assigned && (args.outTypeId != appDataType_t::appInt8 &&
+                                     args.outTypeId != appDataType_t::appInt32))
+            args.outTypeId = appDataType_t::appInt32;
+
+        if(!args.outType_assigned)
+            args.outTypeId = appDataType_t::appInt8;
+
+        if(args.compTypeId == appDataType_t::appInt8)
+        {
+            profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
+                                                        args.init_method,
+                                                        args.do_log,
+                                                        args.do_dumpout,
+                                                        args.nrepeat,
+                                                        args.inLengths,
+                                                        args.reduceDims,
+                                                        args.reduceOp,
+                                                        args.nanOpt,
+                                                        args.indicesOpt,
+                                                        args.scales[0],
+                                                        args.scales[1]);
+        }
+        else if(args.compTypeId == appDataType_t::appInt32)
+        {
+            profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
+                                                         args.init_method,
+                                                         args.do_log,
+                                                         args.do_dumpout,
+                                                         args.nrepeat,
+                                                         args.inLengths,
+                                                         args.reduceDims,
+                                                         args.reduceOp,
+                                                         args.nanOpt,
+                                                         args.indicesOpt,
+                                                         args.scales[0],
+                                                         args.scales[1]);
+        }
+        else
+            throw std::runtime_error("Invalid compType assignment!");
+    }
+    else if(args.use_bf16)
+    {
+        if(args.outType_assigned && (args.outTypeId != appDataType_t::appBFloat16 &&
+                                     args.outTypeId != appDataType_t::appFloat))
+            args.outTypeId = appDataType_t::appFloat;
+
+        if(!args.outType_assigned)
+            args.outTypeId = appDataType_t::appBFloat16;
+
+        profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
+                                                             args.init_method,
+                                                             args.do_log,
+                                                             args.do_dumpout,
+                                                             args.nrepeat,
+                                                             args.inLengths,
+                                                             args.reduceDims,
+                                                             args.reduceOp,
+                                                             args.nanOpt,
+                                                             args.indicesOpt,
+                                                             args.scales[0],
+                                                             args.scales[1]);
+    }
     else
     {
-        if(args.compTypeId == appFloat)
+        if(args.compTypeId == appDataType_t::appFloat)
         {
             profile_reduce_impl<float, float, float>(args.do_verification,
                                                      args.init_method,
@@ -402,7 +480,7 @@ int profile_reduce(int argc, char* argv[])
                                                      args.scales[0],
                                                      args.scales[1]);
         }
-        else if(args.compTypeId == appDouble)
+        else if(args.compTypeId == appDataType_t::appDouble)
        {
             profile_reduce_impl<float, double, float>(args.do_verification,
                                                       args.init_method,
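With the new --int8 and --bf16 switches, the reduction profiler can be driven directly, much like the existing --half path. A hedged example (the flag values follow the updated profiling scripts further below; the init/repeat flags are omitted and the tensor shape is only illustrative):

    ./bin/ckProfiler reduce --bf16 -D 64,4,280,82 -R 0,1,2,3 -O 0 -C 1 -v 1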
profiler/src/profiler.cpp

@@ -5,10 +5,12 @@
 #include <cstring>

 int profile_gemm(int, char*[]);
-int profile_batched_gemm(int, char*[]);
 int profile_gemm_bias_2d(int, char*[]);
 int profile_gemm_bias_relu(int, char*[]);
 int profile_gemm_bias_relu_add(int, char*[]);
+int profile_gemm_reduce(int, char*[]);
+int profile_batched_gemm(int, char*[]);
+int profile_grouped_gemm(int, char*[]);
 int profile_conv_fwd(int, char*[]);
 int profile_conv_fwd_bias_relu(int, char*[]);
 int profile_conv_fwd_bias_relu_add(int, char*[]);
@@ -34,10 +36,18 @@ int main(int argc, char* argv[])
     {
         return profile_gemm_bias_relu_add(argc, argv);
     }
+    else if(strcmp(argv[1], "gemm_reduce") == 0)
+    {
+        return profile_gemm_reduce(argc, argv);
+    }
     else if(strcmp(argv[1], "batched_gemm") == 0)
     {
         return profile_batched_gemm(argc, argv);
     }
+    else if(strcmp(argv[1], "grouped_gemm") == 0)
+    {
+        profile_grouped_gemm(argc, argv);
+    }
     else if(strcmp(argv[1], "conv_fwd") == 0)
     {
         return profile_conv_fwd(argc, argv);
@@ -69,12 +79,14 @@ int main(int argc, char* argv[])
            "  gemm_bias_2d: GEMM+Bias(2D)\n"
            "  gemm_bias_relu: GEMM+Bias+ReLU\n"
            "  gemm_bias_relu_add: GEMM+Bias+ReLU+Add\n"
+           "  gemm_reduce: GEMM+Reduce\n"
+           "  grouped_gemm: Grouped Gemm\n"
            "  conv_fwd: ForwardConvolution\n"
            "  conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
            "  conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
            "  conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n"
            "  conv_bwd: BackwardConvolution\n"
-           "  reduce: REDUCE\n");
+           "  reduce: Reduce\n");
     // clang-format on

     return 0;
script/cmake-rocm.sh

@@ -3,14 +3,14 @@ rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles

-MY_PROJECT_SOURCE=../../..
+MY_PROJECT_SOURCE=../
 MY_PROJECT_INSTALL=../install.dir

 cmake                                                                                             \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                                     \
 -D BUILD_DEV=OFF                                                                                  \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
--D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only " \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
script/profile_reduce_no_index.sh

@@ -3,13 +3,16 @@
 PRECISION=
 ##PRECISION=--half
 ##PRECISION=--double
+##PRECISION=--int8
+##PRECISION=--bf16

-if test -n $PRECISION && test "$PRECISION" = "--half"; then
+if [ -n $PRECISION ] && [ "$PRECISION" = "--half" -o "$PRECISION" = "--bf16" ]; then
    ACCTYPE="-C 1"
-else
-   ACCTYPE=""
+elif [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
+   ACCTYPE="-C 2"
 fi

 driver="./bin/ckProfiler"
 VERIFY="-v $1"
@@ -20,10 +23,16 @@ NREPEAT=$3
 #### 0 - ADD, 5 - AVG, 7 - NORM2
 Operations="0 5 7"

+#### 0 - ADD, 5 - AVG, for int8, no NORM2 supported
+if [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
+   Operations=5
+fi
+
 ## for generic validation
 for op in $Operations; do
     set -x
     #######            datatype  layout  reduce dims  op  acctype  verify  init  repeats
+    $driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
     $driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
     $driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
     $driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
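Assuming the script's remaining positional parameters keep their existing meaning (VERIFY comes from $1 and NREPEAT from $3 as shown above, with INIT presumably from $2 in the unchanged part of the file), a run from the build directory, so that ./bin/ckProfiler resolves, might look like:

    bash ../script/profile_reduce_no_index.sh 1 2 10   # verify=1, init=2, repeat 10 times (illustrative)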
script/profile_reduce_with_index.sh

@@ -3,6 +3,8 @@
 PRECISION=
 ##PRECISION=--half
 ##PRECISION=--double
+##PRECISION=--int8
+##PRECISION=--bf16

 driver="./bin/ckProfiler"
@@ -18,6 +20,7 @@ for op in $Operations; do
 for use_idx in 0 1; do
     set -x
     #######            datatype  layout  reduce dims  op  use index  verify  init  repeats
+    $driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
     $driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
     $driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
     $driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
script/test_convnd_fwd.sh  (new file, 0 → 100644)

#!/usr/bin/env bash
# set -e

DIM1=False
DIM2=True
DIM3=False
DATE=220317
GIT_HASH=4e6dfda
LOG_DIR=${DATE}_${GIT_HASH}
SUFFIX=${GIT_HASH}

#--------------------------------------------------------------------------
# Commandline arguments parsing
# like: cmd -key[--key] value
#--------------------------------------------------------------------------
POSITIONAL=()
while [[ $# -gt 0 ]]
do
key="$1"

case $key in
    -d1|--d1)
    DIM1=True
    echo DIM1: "${DIM1}"
    shift # past argument
    ;;
    -d2|--d2)
    DIM2=True
    echo DIM2: "${DIM2}"
    shift # past argument
    ;;
    -d3|--d3)
    DIM3=True
    echo DIM3: "${DIM3}"
    shift # past argument
    ;;
    -all|--all)
    DIM1=True
    DIM2=True
    DIM3=True
    echo DIM1: "${DIM1}"
    echo DIM2: "${DIM2}"
    echo DIM3: "${DIM3}"
    shift # past argument
    ;;
    -s|--suffix)
    SUFFIX=${SUFFIX}_"$2"
    echo SUFFIX: "${SUFFIX}"
    shift # past argument
    shift # past value
    ;;
    *)    # unknown option
    POSITIONAL+=("$1") # save it in an array for later
    shift # past argument
    ;;
esac
done
set -- "${POSITIONAL[@]}" # restore positional parameters
#--------------------------------------------------------------------------

# NUMACTL="numactl --cpunodebind=1 --membind=1"
NUMACTL=
# ENV_CONF=
GPU=mi100
PROF_ITER_COUNT=10000
LOG_DIR_PATH=../log/${LOG_DIR}

set -x

#-------------------------------------------------------------------------------
# 1D
#-------------------------------------------------------------------------------
if [[ "${DIM1}" == "True" ]]; then
    mkdir -p ${LOG_DIR_PATH}
    echo ">>>>>>>> RUN test conv1d nwc <<<<<<<<<<"
    CMD="./../build/bin/test_conv1d_fwd"
    ${NUMACTL} ${CMD} 2>&1 \
        | tee ${LOG_DIR_PATH}/test_conv1d_fwd_nwc_${SUFFIX}_${GPU}.log
fi

#-------------------------------------------------------------------------------
# 2D
#-------------------------------------------------------------------------------
if [[ "${DIM2}" == "True" ]]; then
    mkdir -p ${LOG_DIR_PATH}
    echo ">>>>>>>> RUN test conv2d nhwc <<<<<<<<<<"
    CMD="./../build/bin/test_conv2d_fwd"
    ${NUMACTL} ${CMD} 2>&1 \
        | tee ${LOG_DIR_PATH}/test_conv2d_fwd_nhwc_${SUFFIX}_${GPU}.log
fi

#-------------------------------------------------------------------------------
# 3D
#-------------------------------------------------------------------------------
if [[ "${DIM3}" == "True" ]]; then
    mkdir -p ${LOG_DIR_PATH}
    echo ">>>>>>>> RUN test conv3d ndhwc <<<<<<<<<<"
    CMD="./../build/bin/test_conv3d_fwd"
    ${NUMACTL} ${CMD} 2>&1 \
        | tee ${LOG_DIR_PATH}/test_conv3d_fwd_ndhwc_${SUFFIX}_${GPU}.log
fi
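Since the test binaries are referenced as ./../build/bin/..., this wrapper appears meant to be run from the script directory. An illustrative invocation (the suffix value is made up) would be:

    bash test_convnd_fwd.sh --all --suffix my_run
    # runs the conv1d/conv2d/conv3d forward tests and tees logs into ../log/220317_4e6dfda/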
script/test_reduce_no_index.sh  (new file, 0 → 100755)

#!/bin/bash

## The following will be used for CI

set -x

## for float
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2

## for float16
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 1 2

## for int8_t
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 3 2

## for bfloat16
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 5 2

set +x
script/test_reduce_with_index.sh  (new file, 0 → 100755)

#!/bin/bash

## The following will be used for CI

set -x

## for float
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2

## for float16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 1 2

## for int8_t
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 3 2

## for bfloat16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 5 2

set +x
test/CMakeLists.txt

@@ -17,6 +17,7 @@ include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
     ${PROJECT_SOURCE_DIR}/test/include
+    ${PROJECT_SOURCE_DIR}/profiler/include
     ${PROJECT_SOURCE_DIR}/external/include/half
 )
@@ -37,7 +38,10 @@ add_subdirectory(conv_util)
 add_subdirectory(reference_conv_fwd)
 add_subdirectory(gemm)
 add_subdirectory(gemm_split_k)
-add_subdirectory(conv2d_fwd)
+add_subdirectory(gemm_reduce)
+add_subdirectory(batched_gemm)
+add_subdirectory(grouped_gemm)
 add_subdirectory(convnd_fwd)
 add_subdirectory(conv2d_bwd_data)
+add_subdirectory(reduce)
 add_subdirectory(cpu_ukernel)
test/batched_gemm/CMakeLists.txt  (new file, 0 → 100644)

add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_fp16 PRIVATE host_tensor)
target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
test/batched_gemm/batched_gemm_fp16.cpp  (new file, 0 → 100644)

#include <half.hpp>
#include <tuple>
#include <vector>

#include "batched_gemm_util.hpp"
#include "reference_batched_gemm.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_batched_gemm_xdl.hpp"
#include "element_wise_operation.hpp"
#include "test_util.hpp"

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

using DeviceBatchedGemmPtr =
    ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
                                                ck::tensor_operation::element_wise::PassThrough,
                                                ck::tensor_operation::element_wise::PassThrough>;

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_batched_gemm_instance {

void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(
    std::vector<DeviceBatchedGemmPtr>& instances);

}
} // namespace device
} // namespace tensor_operation
} // namespace ck

namespace {

using ADataType   = ck::half_t;
using BDataType   = ck::half_t;
using CDataType   = ck::half_t;
using AccDataType = float;

using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;

auto PrepareGemmTensor(const std::size_t batch_count,
                       const ck::batched_gemm_util::GemmParams& params)
{
    auto f_host_tensor_descriptor =
        [batch_count](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
                return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
                                            std::vector<std::size_t>({row * stride, stride, 1}));
            }
            else
            {
                return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
                                            std::vector<std::size_t>({col * stride, 1, stride}));
            }
        };

    Tensor<ADataType> a_g_m_k(f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
    Tensor<BDataType> b_g_k_n(f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
    Tensor<CDataType> c_g_m_n_host_result(
        f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
    Tensor<CDataType> c_g_m_n_device_result(
        f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));

    a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5});
    b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});

    return std::make_tuple(a_g_m_k, b_g_k_n, c_g_m_n_host_result, c_g_m_n_device_result);
}

bool TestBatchedGemm(const std::size_t batch_count, DeviceBatchedGemmPtr& gemmPtr)
{
    // Arrange
    ck::batched_gemm_util::GemmParams params;
    params.M       = 1024;
    params.N       = 1024;
    params.K       = 1024;
    params.StrideA = 1024;
    params.StrideB = 1024;
    params.StrideC = 1024;

    auto host_tensors           = PrepareGemmTensor(batch_count, params);
    const Tensor<ADataType>& a  = std::get<0>(host_tensors);
    const Tensor<BDataType>& b  = std::get<1>(host_tensors);
    Tensor<CDataType>& c_host   = std::get<2>(host_tensors);
    Tensor<CDataType>& c_device = std::get<3>(host_tensors);

    auto a_element_op = PassThrough{};
    auto b_element_op = PassThrough{};
    auto c_element_op = PassThrough{};

    using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
        ReferenceBatchedGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
    ck::batched_gemm_util::RunHostBatchedGemm<ReferenceBatchedGemmInstance>(
        a, b, c_host, a_element_op, b_element_op, c_element_op);

    // Act
    ck::batched_gemm_util::RunDeviceBatchedGemm(
        gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);

    // Assert
    // bool pass = test::check_err(
    //     c_device.mData, c_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
    bool pass = check_error(c_device, c_host) < 0.007815f;
    std::cout << (pass ? "SUCCESS" : "FAILURE") << std::endl;

    return pass;
}

} // namespace

int main()
{
    std::vector<DeviceBatchedGemmPtr> batched_gemm_ptrs;
    ck::tensor_operation::device::device_batched_gemm_instance::
        add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(batched_gemm_ptrs);

    bool pass                     = true;
    const std::size_t batch_count = 4;
    for(auto& gemmPtr : batched_gemm_ptrs)
    {
        pass &= TestBatchedGemm(batch_count, gemmPtr);
    }

    std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
    return pass ? 0 : 1;
}
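A minimal sketch of building and running the new batched GEMM test, assuming an out-of-tree CMake build directory named build (the target name comes from the CMakeLists.txt above; the build-directory layout is an assumption):

    cmake --build build --target test_batched_gemm_fp16
    ./build/bin/test_batched_gemm_fp16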
test/batched_gemm/batched_gemm_util.hpp  (new file, 0 → 100644)

#ifndef BATCHED_GEMM_UTILS_HPP
#define BATCHED_GEMM_UTILS_HPP

#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"

namespace ck {
namespace batched_gemm_util {

struct GemmParams
{
    GemmParams()
        : M(1024), N(1024), K(1024), StrideA(1024), StrideB(1024), StrideC(1024), alpha(1), beta(0)
    {
    }

    ck::index_t M;
    ck::index_t N;
    ck::index_t K;

    ck::index_t StrideA;
    ck::index_t StrideB;
    ck::index_t StrideC;

    float alpha;
    float beta;
};

template <typename BatchedGemmInstance,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
void RunHostBatchedGemm(const Tensor<ADataType>& A,
                        const Tensor<BDataType>& B,
                        Tensor<CDataType>& C,
                        AElementwiseOperation a_element_op,
                        BElementwiseOperation b_element_op,
                        CElementwiseOperation c_element_op)
{
    auto ref_batched_gemm = BatchedGemmInstance{};
    auto ref_invoker      = ref_batched_gemm.MakeInvoker();

    auto ref_argument =
        ref_batched_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op);

    ref_invoker.Run(ref_argument);
}

template <typename DeviceGemmPtr,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
void RunDeviceBatchedGemm(DeviceGemmPtr& batched_gemm_ptr,
                          const ck::batched_gemm_util::GemmParams& params,
                          const Tensor<ADataType>& A,
                          const Tensor<BDataType>& B,
                          Tensor<CDataType>& C,
                          AElementwiseOperation a_element_op,
                          BElementwiseOperation b_element_op,
                          CElementwiseOperation c_element_op)
{
    DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpace());
    DeviceMem b_g_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace());
    DeviceMem c_g_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace());

    a_g_m_k_device_buf.ToDevice(A.mData.data());
    b_g_k_n_device_buf.ToDevice(B.mData.data());

    const auto batch_count = A.mDesc.GetLengths()[0];
    auto invoker_ptr       = batched_gemm_ptr->MakeInvokerPointer();
    auto argument_ptr      = batched_gemm_ptr->MakeArgumentPointer(
        static_cast<ADataType*>(a_g_m_k_device_buf.GetDeviceBuffer()),
        static_cast<BDataType*>(b_g_k_n_device_buf.GetDeviceBuffer()),
        static_cast<CDataType*>(c_g_m_n_device_buf.GetDeviceBuffer()),
        params.M,
        params.N,
        params.K,
        params.StrideA,
        params.StrideB,
        params.StrideC,
        a_element_op,
        b_element_op,
        c_element_op,
        batch_count);

    if(!batched_gemm_ptr->IsSupportedArgument(argument_ptr.get()))
    {
        throw std::runtime_error("wrong! device_gemm with the specified compilation parameters does "
                                 "not support this GEMM problem");
    }

    invoker_ptr->Run(argument_ptr.get());
    c_g_m_n_device_buf.FromDevice(C.mData.data());
}

} // namespace batched_gemm_util
} // namespace ck
#endif
test/conv2d_bwd_data/conv2d_bwd_data.cpp

@@ -182,8 +182,8 @@ int main(int argc, char* argv[])
     out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
     wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());

     // reset input to zero
-    in_n_c_hi_wi_device_result.GenerateTensorValue(GeneratorTensor_1<InDataType>{5});
+    in_n_c_hi_wi_device_result.GenerateTensorValue(GeneratorTensor_1<InDataType>{0});
     in_device_buf.ToDevice(in_n_c_hi_wi_device_result.mData.data());

     // get host result
@@ -225,9 +225,9 @@ int main(int argc, char* argv[])
         ck::tensor_operation::device::device_conv2d_bwd_data_instance::
             add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
     }
-    else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ushort> &&
-                      ck::is_same_v<ck::remove_cv_t<WeiDataType>, ushort> &&
-                      ck::is_same_v<ck::remove_cv_t<OutDataType>, ushort>)
+    else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::bhalf_t> &&
+                      ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::bhalf_t> &&
+                      ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::bhalf_t>)
     {
         ck::tensor_operation::device::device_conv2d_bwd_data_instance::
             add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
test/conv2d_fwd/CMakeLists.txt  (deleted, 100644 → 0)

-add_test_executable(test_conv2d_fwd conv2d_fwd.cpp)
-target_link_libraries(test_conv2d_fwd PRIVATE host_tensor)
-target_link_libraries(test_conv2d_fwd PRIVATE device_conv2d_fwd_instance)
test/conv2d_fwd/conv2d_fwd.cpp  (deleted, 100644 → 0; the removed file is reproduced below)

#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_conv.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_conv_fwd.hpp"
#include "element_wise_operation.hpp"
#include "reference_conv_fwd.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace device_conv2d_fwd_instance {

using DeviceConvFwdNoOpPtr =
    DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
                     ck::tensor_operation::element_wise::PassThrough,
                     ck::tensor_operation::element_wise::PassThrough>;

void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);

} // namespace device_conv2d_fwd_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

template <typename T>
static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
{
    float max_diff = 1e-6;

    for(int i = 0; i < ref.mData.size(); ++i)
    {
        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
        if(max_diff < diff)
        {
            return false;
        }
    }
    return true;
}

int main(int argc, char* argv[])
{
    int data_type   = 0;
    int init_method = 0;

    // Conv shape
    ck::index_t N               = 128;
    ck::index_t K               = 256;
    ck::index_t C               = 192;
    ck::index_t Y               = 3;
    ck::index_t X               = 3;
    ck::index_t Hi              = 71;
    ck::index_t Wi              = 71;
    ck::index_t conv_stride_h   = 2;
    ck::index_t conv_stride_w   = 2;
    ck::index_t conv_dilation_h = 1;
    ck::index_t conv_dilation_w = 1;
    ck::index_t in_left_pad_h   = 1;
    ck::index_t in_left_pad_w   = 1;
    ck::index_t in_right_pad_h  = 1;
    ck::index_t in_right_pad_w  = 1;

    if(argc == 1)
    {
        data_type   = 1;
        init_method = 1;
    }
    else if(argc == 3)
    {
        data_type   = std::stoi(argv[1]);
        init_method = std::stoi(argv[2]);
    }
    else if(argc == 18)
    {
        data_type       = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        N               = std::stoi(argv[3]);
        K               = std::stoi(argv[4]);
        C               = std::stoi(argv[5]);
        Y               = std::stoi(argv[6]);
        X               = std::stoi(argv[7]);
        Hi              = std::stoi(argv[8]);
        Wi              = std::stoi(argv[9]);
        conv_stride_h   = std::stoi(argv[10]);
        conv_stride_w   = std::stoi(argv[11]);
        conv_dilation_h = std::stoi(argv[12]);
        conv_dilation_w = std::stoi(argv[13]);
        in_left_pad_h   = std::stoi(argv[14]);
        in_left_pad_w   = std::stoi(argv[15]);
        in_right_pad_h  = std::stoi(argv[16]);
        in_right_pad_w  = std::stoi(argv[17]);
    }
    else
    {
        printf("arg1: data type (0=fp32, 1=fp16, 2= bfp16, 3= int8_t )\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3 to 17: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");
        exit(1);
    }

    auto Run = [&](auto input_type, auto wei_type, auto out_type) {
        using InDataType  = decltype(input_type);
        using WeiDataType = decltype(wei_type);
        using OutDataType = decltype(out_type);

        using ReferenceConvFwdInstance =
            ck::tensor_operation::host::ReferenceConvFwd<InDataType, WeiDataType, OutDataType,
                                                         InElementOp, WeiElementOp, OutElementOp>;

        const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
        const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;

        const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
        const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;

        const std::vector<ck::index_t> input_spatial_lengths{Hi, Wi};
        const std::vector<ck::index_t> filter_spatial_lengths{Y, X};
        const std::vector<ck::index_t> output_spatial_lengths{Ho, Wo};
        const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
        const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
        const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
        const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};

        auto f_host_tensor_descriptor =
            [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
            };

        Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
        Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X));
        Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
        Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));

        std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
        std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
        std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;

        switch(init_method)
        {
        case 0: break;
        case 1:
            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
            break;
        default:
            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0, 1});
            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1, 1});
        }

        DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
        DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
        DeviceMem out_device_buf(sizeof(OutDataType) *
                                 out_n_k_ho_wo_device_result.mDesc.GetElementSpace());

        in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
        wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());

        using PassThrough = ck::tensor_operation::element_wise::PassThrough;
        using DeviceConvFwdNoOpPtr =
            ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;

        // add device Conv instances
        std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;

        if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
                     ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
                     ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
        {
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
        }
        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
        {
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
        }
        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::bhalf_t> &&
                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::bhalf_t> &&
                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::bhalf_t>)
        {
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
        }
        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, int8_t> &&
                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, int8_t> &&
                          ck::is_same_v<ck::remove_cv_t<OutDataType>, int8_t>)
        {
            ck::tensor_operation::device::device_conv2d_fwd_instance::
                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
        }

        if(conv_ptrs.size() <= 0)
        {
            throw std::runtime_error("wrong! no device Conv instance found");
        }

        auto ref_conv     = ReferenceConvFwdInstance{};
        auto ref_invoker  = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
                                                  wei_k_c_y_x,
                                                  out_n_k_ho_wo_host_result,
                                                  conv_filter_strides,
                                                  conv_filter_dilations,
                                                  input_left_pads,
                                                  input_right_pads,
                                                  InElementOp{},
                                                  WeiElementOp{},
                                                  OutElementOp{});
        ref_invoker.Run(ref_argument);

        // profile device Conv instances
        bool success = false;
        for(auto& conv_ptr : conv_ptrs)
        {
            auto argument_ptr = conv_ptr->MakeArgumentPointer(
                static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                N,
                K,
                C,
                input_spatial_lengths,
                filter_spatial_lengths,
                output_spatial_lengths,
                conv_filter_strides,
                conv_filter_dilations,
                input_left_pads,
                input_right_pads,
                PassThrough{},
                PassThrough{},
                PassThrough{});

            auto invoker_ptr = conv_ptr->MakeInvokerPointer();

            if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
            {
                invoker_ptr->Run(argument_ptr.get(), 0);

                out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

                if(!check_out(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result))
                {
                    success = false;
                    break;
                }
                success = true;
            }
        }

        if(success)
        {
            std::cout << "test conv2d fwd : Pass" << std::endl;
            return 0;
        }
        else
        {
            std::cout << "test conv2d fwd: Fail " << std::endl;
            return -1;
        }
    };

    int res = -1;
    if(data_type == 0)
    {
        res = Run(float(), float(), float());
    }
    else if(data_type == 1)
    {
        res = Run(ck::half_t(), ck::half_t(), ck::half_t());
    }
    else if(data_type == 2)
    {
        Run(ck::bhalf_t(), ck::bhalf_t(), ck::bhalf_t());
    }
    else if(data_type == 3)
    {
        res = Run(int8_t(), int8_t(), int8_t());
    }

    return res;
}
test/conv_util/conv_util.cpp
View file @
e72c0c43
@@ -5,33 +5,10 @@
 #include "config.hpp"
 #include "conv_utils.hpp"
 #include "tensor_layout.hpp"
+#include "test_util.hpp"
 
 namespace {
 
-template <typename T>
-bool cmp_vec(const std::vector<T>& out, const std::vector<T>& ref, const std::string& msg)
-{
-    if(out.size() != ref.size())
-    {
-        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
-                  << std::endl
-                  << msg << std::endl;
-        return false;
-    }
-    for(std::size_t i = 0; i < ref.size(); ++i)
-    {
-        if(out[i] != ref[i])
-        {
-            std::cout << "out[" << i << "] != ref[" << i << "]: " << out[i] << "!=" << ref[i]
-                      << std::endl
-                      << msg << std::endl;
-            return false;
-        }
-    }
-    return true;
-}
-
 bool TestConvParams_GetOutputSpatialLengths()
 {
     bool res{true};
@@ -43,26 +20,26 @@ bool TestConvParams_GetOutputSpatialLengths()
     // padding {{1,1}, {1,1}}
     ck::conv_util::ConvParams conv_params;
     std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len,
-                  std::vector<ck::index_t>{36, 36},
-                  "Error: ConvParams 2D default constructor.");
+    res = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{36, 36},
+                          "Error: ConvParams 2D default constructor.");
 
     conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
     out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(
-        out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}.");
+    res = test::check_err(
+        out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}.");
 
     conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2};
     conv_params.input_left_pads     = std::vector<ck::index_t>{2, 2};
     conv_params.input_right_pads    = std::vector<ck::index_t>{2, 2};
     out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len,
-                  std::vector<ck::index_t>{37, 37},
-                  "Error: ConvParams 2D padding left/right {2,2}.");
+    res = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{37, 37},
+                          "Error: ConvParams 2D padding left/right {2,2}.");
 
     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
     out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(
-        out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}.");
+    res = test::check_err(
+        out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}.");
 
     conv_params.conv_filter_strides = std::vector<ck::index_t>{3, 3};
@@ -70,9 +47,9 @@ bool TestConvParams_GetOutputSpatialLengths()
     conv_params.input_right_pads      = std::vector<ck::index_t>{1, 1};
     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
     out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len,
-                  std::vector<ck::index_t>{23, 23},
-                  "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.");
+    res = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{23, 23},
+                          "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.");
 
     // -------------------------- 1D ------------------------------------
     conv_params.num_dim_spatial = 1;
@@ -84,25 +61,24 @@ bool TestConvParams_GetOutputSpatialLengths()
     conv_params.input_right_pads = std::vector<ck::index_t>{1};
     out_spatial_len              = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(
-        out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D.");
+    res = test::check_err(
+        out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D default constructor.");
 
     conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
     out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(
-        out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}.");
+    res = test::check_err(
+        out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}.");
 
     conv_params.conv_filter_strides = std::vector<ck::index_t>{2};
     conv_params.input_left_pads     = std::vector<ck::index_t>{2};
     conv_params.input_right_pads    = std::vector<ck::index_t>{2};
     out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len,
-                  std::vector<ck::index_t>{37},
-                  "Error: ConvParams 1D padding left/right {2}.");
+    res = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{37},
+                          "Error: ConvParams 1D padding left/right {2}.");
 
     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
     out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(
-        out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}.");
+    res = test::check_err(
+        out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}.");
 
     conv_params.conv_filter_strides = std::vector<ck::index_t>{3};
@@ -110,9 +86,52 @@ bool TestConvParams_GetOutputSpatialLengths()
     conv_params.input_right_pads      = std::vector<ck::index_t>{1};
     conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
     out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res = cmp_vec(out_spatial_len,
-                  std::vector<ck::index_t>{23},
-                  "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.");
+    res = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{23},
+                          "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.");
+
+    // -------------------------- 3D ------------------------------------
+    conv_params.num_dim_spatial        = 3;
+    conv_params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
+    conv_params.input_spatial_lengths  = std::vector<ck::index_t>{71, 71, 71};
+    conv_params.conv_filter_strides    = std::vector<ck::index_t>{2, 2, 2};
+    conv_params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.input_left_pads        = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.input_right_pads       = std::vector<ck::index_t>{1, 1, 1};
+    out_spatial_len                    = conv_params.GetOutputSpatialLengths();
+    res = test::check_err(
+        out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D.");
+
+    conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1, 1};
+    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    res = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{71, 71, 71},
+                          "Error: ConvParams 3D stride {1, 1, 1}.");
+
+    conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2, 2};
+    conv_params.input_left_pads     = std::vector<ck::index_t>{2, 2, 2};
+    conv_params.input_right_pads    = std::vector<ck::index_t>{2, 2, 2};
+    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    res = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{37, 37, 37},
+                          "Error: ConvParams 3D padding left/right {2, 2, 2}.");
+
+    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
+    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    res = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{36, 36, 36},
+                          "Error: ConvParams 3D dilation {2, 2, 2}.");
+
+    conv_params.conv_filter_strides   = std::vector<ck::index_t>{3, 3, 3};
+    conv_params.input_left_pads       = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.input_right_pads      = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
+    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    res = test::check_err(
+        out_spatial_len,
+        std::vector<ck::index_t>{23, 23, 23},
+        "Error: ConvParams 3D strides{3, 3, 3}, padding {1, 1, 1}, dilations {2, 2, 2}.");
 
     return res;
 }
@@ -123,23 +142,44 @@ bool TestGetHostTensorDescriptor()
     namespace tl = ck::tensor_layout::convolution;
     std::vector<std::size_t> dims{2, 3, 4, 5};
 
     HostTensorDescriptor h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{});
-    res = cmp_vec(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!");
-    res = cmp_vec(h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!");
+    res = test::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!");
+    res = test::check_err(
+        h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!");
 
     h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCHW{});
-    res = cmp_vec(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!");
-    res = cmp_vec(h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!");
+    res = test::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!");
+    res = test::check_err(
+        h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!");
 
     dims = std::vector<std::size_t>{2, 3, 4};
     h    = ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{});
-    res = cmp_vec(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!");
-    res = cmp_vec(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!");
+    res = test::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!");
+    res = test::check_err(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!");
 
     h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCW{});
-    res = cmp_vec(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!");
-    res = cmp_vec(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!");
+    res = test::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!");
+    res = test::check_err(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!");
+
+    dims = std::vector<std::size_t>{2, 3, 4, 5, 6};
+    h    = ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWC{});
+    res  = test::check_err(h.GetLengths(), dims, "Error: wrong NDHWC dimensions lengths!");
+    res  = test::check_err(h.GetStrides(),
+                           {3 * 4 * 5 * 6, // N
+                            1,             // C
+                            3 * 5 * 6,     // D
+                            3 * 6,         // H
+                            3},            // W
+                           "Error: wrong NDHWC dimensions strides!");
+
+    h   = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCDHW{});
+    res = test::check_err(h.GetLengths(), dims, "Error: wrong NCDHW dimensions lengths!");
+    res = test::check_err(h.GetStrides(),
+                          {3 * 4 * 5 * 6, // N
+                           4 * 5 * 6,     // C
+                           5 * 6,         // D
+                           6,             // H
+                           1},            // W
+                          "Error: wrong NCDHW dimensions strides!");
 
     return res;
 }
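The expected spatial lengths asserted above (36, 71, 37, 23, ...) all follow from the standard convolution output-length formula, out = (in + pad_left + pad_right - dilation * (filter - 1) - 1) / stride + 1. A minimal standalone sketch, independent of the repository's ConvParams class, that reproduces the 71-element input / 3-tap filter cases used in these tests:

#include <cassert>

// out = (in + pad_left + pad_right - dilation * (filter - 1) - 1) / stride + 1
int conv_out_len(int in, int filter, int stride, int dilation, int pad_l, int pad_r)
{
    return (in + pad_l + pad_r - dilation * (filter - 1) - 1) / stride + 1;
}

int main()
{
    assert(conv_out_len(71, 3, 2, 1, 1, 1) == 36); // default: stride 2, dilation 1, pad 1
    assert(conv_out_len(71, 3, 1, 1, 1, 1) == 71); // stride 1
    assert(conv_out_len(71, 3, 2, 1, 2, 2) == 37); // padding 2
    assert(conv_out_len(71, 3, 2, 2, 2, 2) == 36); // dilation 2 (with padding 2)
    assert(conv_out_len(71, 3, 3, 2, 1, 1) == 23); // stride 3, dilation 2, padding 1
    return 0;
}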
test/convnd_fwd/CMakeLists.txt
View file @
e72c0c43
-add_test_executable(test_convnd_fwd convnd_fwd.cpp)
-target_link_libraries(test_convnd_fwd PRIVATE host_tensor)
+add_custom_target(test_convnd_fwd)
+
+add_test_executable(test_conv1d_fwd conv1d_fwd.cpp)
+target_link_libraries(test_conv1d_fwd PRIVATE host_tensor)
+target_link_libraries(test_conv1d_fwd PRIVATE device_conv1d_fwd_instance)
+add_dependencies(test_convnd_fwd test_conv1d_fwd)
+
+add_test_executable(test_conv2d_fwd conv2d_fwd.cpp)
+target_link_libraries(test_conv2d_fwd PRIVATE host_tensor)
+target_link_libraries(test_conv2d_fwd PRIVATE device_conv2d_fwd_instance)
+add_dependencies(test_convnd_fwd test_conv2d_fwd)
+
+add_test_executable(test_conv3d_fwd conv3d_fwd.cpp)
+target_link_libraries(test_conv3d_fwd PRIVATE host_tensor)
+target_link_libraries(test_conv3d_fwd PRIVATE device_conv3d_fwd_instance)
+add_dependencies(test_convnd_fwd test_conv3d_fwd)
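With this change, test_convnd_fwd is no longer a single executable built from convnd_fwd.cpp; it becomes an umbrella custom target whose add_dependencies pull in the per-dimension executables, so building test_convnd_fwd builds test_conv1d_fwd, test_conv2d_fwd, and test_conv3d_fwd, each linked against host_tensor and its matching device_convNd_fwd_instance library.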