gaoqiong / composable_kernel · Commits · dd6a8de4

Commit dd6a8de4, authored Apr 06, 2022 by Jehandad Khan

Merge branch 'develop' into jd/dev_pkg

Parents: 0aa899aa, abf4bdb9
Changes: 470

Showing 20 changed files with 1286 additions and 412 deletions
profiler/src/profile_gemm_reduce.cpp                    +146    -0
profiler/src/profile_grouped_gemm.cpp                   +157    -0
profiler/src/profile_reduce.cpp                         +121   -43
profiler/src/profiler.cpp                                +40    -8
script/cmake-rocm.sh                                      +4    -2
script/count_vgpr.sh                                     +17  -256
script/profile_reduce_no_index.sh                        +64   -47
script/profile_reduce_with_index.sh                      +53   -42
script/test_convnd_fwd.sh                               +110    -0
script/test_reduce_no_index.sh                           +52    -0
script/test_reduce_with_index.sh                         +52    -0
test/CMakeLists.txt                                       +8    -2
test/batched_gemm/CMakeLists.txt                          +4    -0
test/batched_gemm/batched_gemm_fp16.cpp                  +41    -0
test/batched_gemm/batched_gemm_util.hpp                 +106    -0
test/batched_gemm_reduce/CMakeLists.txt                   +9    -0
test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp    +64    -0
test/conv2d_bwd_data/conv2d_bwd_data.cpp                 +14   -12
test/conv2d_bwd_weight/CMakeLists.txt                     +8    -0
test/conv2d_bwd_weight/conv2d_bwd_weight.cpp            +216    -0
profiler/src/profile_gemm_reduce.cpp  0 → 100644

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_gemm_reduce_impl.hpp"

int profile_gemm_reduce(int argc, char* argv[])
{
    enum struct GemmMatrixLayout
    {
        MK_KN_MN, // 0
        MK_NK_MN, // 1
        KM_KN_MN, // 2
        KM_NK_MN, // 3
    };

    enum struct GemmReduceDataType
    {
        F32_F32_F32_F32_F32, // 0
        F16_F16_F16_F32_F32, // 1
    };

    if(!(argc == 14 || argc == 15))
    {
        printf("arg1: tensor operation (gemm_reduce: GEMM+Reduce)\n");
        printf("arg2: data type (0: fp32; 1: fp16)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg6: print tensor value (0: no; 1: yes)\n");
        printf("arg7: run kernel # of times (>1)\n");
        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
        printf("arg14: split k into multiple batches\n");
        exit(1);
    }

    const auto data_type       = static_cast<GemmReduceDataType>(std::stoi(argv[2]));
    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
    const int nrepeat          = std::stoi(argv[7]);

    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
    const int K = std::stoi(argv[10]);

    const int StrideA = std::stoi(argv[11]);
    const int StrideB = std::stoi(argv[12]);
    const int StrideC = std::stoi(argv[13]);

    if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 &&
       layout == GemmMatrixLayout::MK_KN_MN)
    {
        ck::profiler::profile_gemm_reduce_impl<ck::half_t,
                                               ck::half_t,
                                               ck::half_t,
                                               float,
                                               ck::tensor_layout::gemm::RowMajor,
                                               ck::tensor_layout::gemm::RowMajor,
                                               ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, M, N, K,
            (StrideA < 0) ? K : StrideA,
            (StrideB < 0) ? N : StrideB,
            (StrideC < 0) ? N : StrideC);
    }
    else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 &&
            layout == GemmMatrixLayout::MK_NK_MN)
    {
        ck::profiler::profile_gemm_reduce_impl<ck::half_t,
                                               ck::half_t,
                                               ck::half_t,
                                               float,
                                               ck::tensor_layout::gemm::RowMajor,
                                               ck::tensor_layout::gemm::ColumnMajor,
                                               ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, M, N, K,
            (StrideA < 0) ? K : StrideA,
            (StrideB < 0) ? K : StrideB,
            (StrideC < 0) ? N : StrideC);
    }
    else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 &&
            layout == GemmMatrixLayout::KM_KN_MN)
    {
        ck::profiler::profile_gemm_reduce_impl<ck::half_t,
                                               ck::half_t,
                                               ck::half_t,
                                               float,
                                               ck::tensor_layout::gemm::ColumnMajor,
                                               ck::tensor_layout::gemm::RowMajor,
                                               ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, M, N, K,
            (StrideA < 0) ? M : StrideA,
            (StrideB < 0) ? N : StrideB,
            (StrideC < 0) ? N : StrideC);
    }
    else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 &&
            layout == GemmMatrixLayout::KM_NK_MN)
    {
        ck::profiler::profile_gemm_reduce_impl<ck::half_t,
                                               ck::half_t,
                                               ck::half_t,
                                               float,
                                               ck::tensor_layout::gemm::ColumnMajor,
                                               ck::tensor_layout::gemm::ColumnMajor,
                                               ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, M, N, K,
            (StrideA < 0) ? M : StrideA,
            (StrideB < 0) ? K : StrideB,
            (StrideC < 0) ? N : StrideC);
    }
    else
    {
        throw std::runtime_error("wrong! this data_type & layout is not implemented");
    }

    return 1;
}
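A note on the stride arguments above: passing a negative StrideA/StrideB/StrideC makes the profiler fall back to the packed leading dimension implied by the chosen layout, which is why each branch defaults to K, N, or M differently. A minimal self-contained sketch of that convention (the Layout enum and helper name here are illustrative, not part of the profiler):

#include <cassert>

// Illustrative helper: a negative stride means "use the packed leading
// dimension". A row-major R x C matrix has leading dimension C; a
// column-major R x C matrix (stored transposed) has leading dimension R.
enum class Layout { RowMajor, ColumnMajor };

int default_stride(int rows, int cols, int stride, Layout layout)
{
    if(stride >= 0)
        return stride; // caller supplied an explicit leading dimension
    return layout == Layout::RowMajor ? cols : rows;
}

int main()
{
    // A is M x K: row-major defaults to K, column-major to M, matching
    // "(StrideA < 0) ? K : StrideA" and "(StrideA < 0) ? M : StrideA" above.
    assert(default_stride(1024, 64, -1, Layout::RowMajor) == 64);
    assert(default_stride(1024, 64, -1, Layout::ColumnMajor) == 1024);
    assert(default_stride(1024, 64, 2048, Layout::RowMajor) == 2048);
    return 0;
}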
profiler/src/profile_grouped_gemm.cpp  0 → 100644

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <sstream> // for std::istringstream (with <string> and <vector>, not in the original include list)
#include <string>
#include <vector>
#include <half.hpp>
#include "profile_grouped_gemm_impl.hpp"

enum struct GemmMatrixLayout
{
    MK_KN_MN, // 0
    MK_NK_MN, // 1
    KM_KN_MN, // 2
    KM_NK_MN, // 3
    MK_KN_NM, // 4
    MK_NK_NM, // 5
    KM_KN_NM, // 6
    KM_NK_NM, // 7
};

enum struct GemmDataType
{
    F32_F32_F32,    // 0
    F16_F16_F16,    // 1
    BF16_BF16_BF16, // 2
    INT8_INT8_INT8, // 3
};

std::vector<int> argToIntArray(char* input)
{
    std::vector<int> out;

    std::istringstream in(input);

    std::string item;

    while(std::getline(in, item, ','))
    {
        out.push_back(std::stoi(item));
    }

    return out;
}

int profile_grouped_gemm(int argc, char* argv[])
{
    if(!(argc == 14))
    {
        printf("arg1: tensor operation (grouped_gemm: Grouped GEMM)\n");
        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg6: print tensor value (0: no; 1: yes)\n");
        printf("arg7: run kernel # of times (>1)\n");
        printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
               "64,64 64,64 128,128)\n");
        exit(1);
    }

    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
    const int nrepeat          = std::stoi(argv[7]);

    const auto Ms = argToIntArray(argv[8]);
    const auto Ns = argToIntArray(argv[9]);
    const auto Ks = argToIntArray(argv[10]);

    const auto StrideAs = argToIntArray(argv[11]);
    const auto StrideBs = argToIntArray(argv[12]);
    const auto StrideCs = argToIntArray(argv[13]);

    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
                                                ck::half_t,
                                                ck::half_t,
                                                ck::tensor_layout::gemm::RowMajor,
                                                ck::tensor_layout::gemm::RowMajor,
                                                ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs);
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
                                                ck::half_t,
                                                ck::half_t,
                                                ck::tensor_layout::gemm::RowMajor,
                                                ck::tensor_layout::gemm::ColumnMajor,
                                                ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs);
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
                                                ck::half_t,
                                                ck::half_t,
                                                ck::tensor_layout::gemm::ColumnMajor,
                                                ck::tensor_layout::gemm::RowMajor,
                                                ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs);
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
                                                ck::half_t,
                                                ck::half_t,
                                                ck::tensor_layout::gemm::ColumnMajor,
                                                ck::tensor_layout::gemm::ColumnMajor,
                                                ck::tensor_layout::gemm::RowMajor>(
            do_verification, init_method, do_log, nrepeat, Ms, Ns, Ks, StrideAs, StrideBs, StrideCs);
    }
    else
    {
        throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
    }

    return 1;
}
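For the grouped variant, each of arg8 to arg13 carries one comma-separated value per GEMM in the group, parsed by argToIntArray above via std::getline with a ',' delimiter. A self-contained sketch of the same parsing idiom:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Same idiom as argToIntArray: std::getline with ',' walks the
// comma-separated fields and std::stoi converts each one.
std::vector<int> parse_csv_ints(const std::string& input)
{
    std::vector<int> out;
    std::istringstream in(input);
    std::string item;
    while(std::getline(in, item, ','))
        out.push_back(std::stoi(item));
    return out;
}

int main()
{
    // "256,256" -> {256, 256}: one value per group, so the usage example
    // "256,256 128,128 64,64 ..." describes two GEMMs of shape 256x128x64.
    for(int v : parse_csv_ints("256,256"))
        std::cout << v << '\n';
    return 0;
}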
profiler/src/profile_reduce.cpp

@@ -20,12 +20,12 @@
 using namespace std;

-using ck::NanPropagation_t;
-using ck::ReduceTensorIndices_t;
-using ck::ReduceTensorOp_t;
+using ck::NanPropagation;
+using ck::ReduceTensorIndices;
+using ck::ReduceTensorOp;

 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
-                                       {"toReduceDims", required_argument, nullptr, 'R'},
+                                       {"reduceDims", required_argument, nullptr, 'R'},
                                        {"reduceOp", required_argument, nullptr, 'O'},
                                        {"compType", required_argument, nullptr, 'C'},
                                        {"outType", required_argument, nullptr, 'W'},
...
@@ -34,6 +34,8 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
                                        {"scales", required_argument, nullptr, 'S'},
                                        {"half", no_argument, nullptr, '?'},
                                        {"double", no_argument, nullptr, '?'},
+                                       {"int8", no_argument, nullptr, '?'},
+                                       {"bf16", no_argument, nullptr, '?'},
                                        {"dumpout", required_argument, nullptr, 'o'},
                                        {"verify", required_argument, nullptr, 'v'},
                                        {"log", required_argument, nullptr, 'l'},
...
@@ -82,7 +84,7 @@ static std::vector<T> getTypeValuesFromString(const char* cstr_values)
     return (values);
 }

-typedef enum
+enum struct AppDataType
 {
     appHalf  = 0,
     appFloat = 1,
...
@@ -91,11 +93,11 @@ typedef enum
     appInt8x4   = 4,
     appBFloat16 = 5,
     appDouble   = 6,
-} appDataType_t;
+};

-static void check_reduce_dims(const int rank, const std::vector<int>& toReduceDims)
+static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
 {
-    for(auto dim : toReduceDims)
+    for(auto dim : reduceDims)
     {
         if(dim < 0 || dim >= rank)
             throw std::runtime_error("Invalid dimension index specified for Reducing");
...
@@ -103,7 +105,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& toReduceDi
     unsigned int flag = 0;

-    for(auto dim : toReduceDims)
+    for(auto dim : reduceDims)
     {
         if(flag & (0x1 << dim))
             throw std::runtime_error("All toReduce dimensions should be different!");
...
@@ -119,25 +121,27 @@ class AppArgs
     public:
     bool use_half   = false;
     bool use_double = false;
+    bool use_int8   = false;
+    bool use_bf16   = false;
     std::vector<size_t> inLengths;
     std::vector<size_t> outLengths;
-    std::vector<int> toReduceDims;
+    std::vector<int> reduceDims;
     std::vector<float> scales;

-    ReduceTensorOp_t reduceOp = ReduceTensorOp_t::ADD;
-    appDataType_t compTypeId  = appFloat;
-    appDataType_t outTypeId   = appFloat;
+    ReduceTensorOp reduceOp = ReduceTensorOp::ADD;
+    AppDataType compTypeId  = AppDataType::appFloat;
+    AppDataType outTypeId   = AppDataType::appFloat;
     bool compType_assigned = false;
     bool outType_assigned  = false;
-    NanPropagation_t nanOpt          = NanPropagation_t::NOT_PROPAGATE_NAN;
-    ReduceTensorIndices_t indicesOpt = ReduceTensorIndices_t::NO_INDICES;
-    bool do_log          = false;
-    bool do_verification = false;
-    bool do_dumpout      = false;
+    NanPropagation nanOpt          = NanPropagation::NOT_PROPAGATE_NAN;
+    ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES;
+    bool do_log          = false;
+    bool do_verification = false;
+    bool do_dumpout      = false;
     int init_method;
     int nrepeat;
...
@@ -152,7 +156,7 @@ class AppArgs
         std::cout << "Usage of " << cmd << std::endl;
         std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
                   << std::endl;
-        std::cout << "--toReduceDims or -R, comma separated list of to-reduce dimensions"
+        std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions"
                   << std::endl;
         std::cout << "--reduceOp or -O, enum value indicating the reduction operations"
                   << std::endl;
...
@@ -169,6 +173,8 @@ class AppArgs
                   << std::endl;
         std::cout << "--half, use fp16 for the input and output tensor data types" << std::endl;
         std::cout << "--double, use fp64 for the input and output tensor data types" << std::endl;
+        std::cout << "--int8, use int8 for the input and output tensor data types" << std::endl;
+        std::cout << "--bf16, use bfloat16 for the input and output tensor data types" << std::endl;
         std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
                      "comparing with the host-based reduction"
                   << std::endl;
...
@@ -201,39 +207,39 @@ class AppArgs
             if(!optarg)
                 throw std::runtime_error("Invalid option format!");

-            toReduceDims = getTypeValuesFromString<int>(optarg);
+            reduceDims = getTypeValuesFromString<int>(optarg);
             break;
         case 'O':
             if(!optarg)
                 throw std::runtime_error("Invalid option format!");

-            reduceOp = static_cast<ReduceTensorOp_t>(std::atoi(optarg));
+            reduceOp = static_cast<ReduceTensorOp>(std::atoi(optarg));
             break;
         case 'C':
             if(!optarg)
                 throw std::runtime_error("Invalid option format!");

-            compTypeId = static_cast<appDataType_t>(std::atoi(optarg));
+            compTypeId = static_cast<AppDataType>(std::atoi(optarg));
             compType_assigned = true;
             break;
         case 'W':
             if(!optarg)
                 throw std::runtime_error("Invalid option format!");

-            outTypeId = static_cast<appDataType_t>(std::atoi(optarg));
+            outTypeId = static_cast<AppDataType>(std::atoi(optarg));
             outType_assigned = true;
             break;
         case 'N':
             if(!optarg)
                 throw std::runtime_error("Invalid option format!");

-            nanOpt = static_cast<NanPropagation_t>(std::atoi(optarg));
+            nanOpt = static_cast<NanPropagation>(std::atoi(optarg));
             break;
         case 'I':
             if(!optarg)
                 throw std::runtime_error("Invalid option format!");

-            indicesOpt = static_cast<ReduceTensorIndices_t>(std::atoi(optarg));
+            indicesOpt = static_cast<ReduceTensorIndices>(std::atoi(optarg));
             break;
         case 'S':
             if(!optarg)
...
@@ -267,6 +273,10 @@ class AppArgs
                     use_half = true;
                 else if(std::string(long_options[option_index].name) == "double")
                     use_double = true;
+                else if(std::string(long_options[option_index].name) == "int8")
+                    use_int8 = true;
+                else if(std::string(long_options[option_index].name) == "bf16")
+                    use_bf16 = true;
                 else if(std::string(long_options[option_index].name) == "help")
                 {
                     show_usage(argv[0]);
...
@@ -293,10 +303,10 @@ class AppArgs
                 scales.push_back(0.0f);
         };

-        if(reduceOp == ReduceTensorOp_t::MIN || reduceOp == ReduceTensorOp_t::MAX ||
-           reduceOp == ReduceTensorOp_t::AMAX)
+        if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX ||
+           reduceOp == ReduceTensorOp::AMAX)
         {
-            if(indicesOpt != ReduceTensorIndices_t::NO_INDICES)
+            if(indicesOpt != ReduceTensorIndices::NO_INDICES)
                 need_indices = true;

             // for indexable operations, no need to assign compType and outType, just let them be
...
@@ -321,23 +331,24 @@ int profile_reduce(int argc, char* argv[])
     int rank = args.inLengths.size();

-    check_reduce_dims(rank, args.toReduceDims);
+    check_reduce_dims(rank, args.reduceDims);

-    if(args.reduceOp == ReduceTensorOp_t::MUL || args.reduceOp == ReduceTensorOp_t::NORM1)
+    if(args.reduceOp == ReduceTensorOp::MUL || args.reduceOp == ReduceTensorOp::NORM1)
         throw std::runtime_error("MUL and NORM1 are not supported by composable kernel!");

     if(args.use_half)
     {
         if(!args.compType_assigned)
-            args.compTypeId = appHalf;
+            args.compTypeId = AppDataType::appHalf;

-        if(args.outType_assigned && (args.outTypeId != appHalf && args.outTypeId != appFloat))
-            args.outTypeId = appFloat;
+        if(args.outType_assigned &&
+           (args.outTypeId != AppDataType::appHalf && args.outTypeId != AppDataType::appFloat))
+            args.outTypeId = AppDataType::appFloat;

         if(!args.outType_assigned)
-            args.outTypeId = appHalf;
+            args.outTypeId = AppDataType::appHalf;

-        if(args.compTypeId == appHalf)
+        if(args.compTypeId == AppDataType::appHalf)
         {
             profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification,
                                                                     args.init_method,
...
@@ -345,14 +356,14 @@ int profile_reduce(int argc, char* argv[])
                                                                     args.do_dumpout,
                                                                     args.nrepeat,
                                                                     args.inLengths,
-                                                                    args.toReduceDims,
+                                                                    args.reduceDims,
                                                                     args.reduceOp,
                                                                     args.nanOpt,
                                                                     args.indicesOpt,
                                                                     args.scales[0],
                                                                     args.scales[1]);
         }
-        else if(args.compTypeId == appFloat)
+        else if(args.compTypeId == AppDataType::appFloat)
         {
             profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
                                                                args.init_method,
...
@@ -360,7 +371,7 @@ int profile_reduce(int argc, char* argv[])
                                                                args.do_dumpout,
                                                                args.nrepeat,
                                                                args.inLengths,
-                                                               args.toReduceDims,
+                                                               args.reduceDims,
                                                                args.reduceOp,
                                                                args.nanOpt,
                                                                args.indicesOpt,
...
@@ -378,16 +389,83 @@ int profile_reduce(int argc, char* argv[])
                                                            args.do_dumpout,
                                                            args.nrepeat,
                                                            args.inLengths,
-                                                           args.toReduceDims,
+                                                           args.reduceDims,
                                                            args.reduceOp,
                                                            args.nanOpt,
                                                            args.indicesOpt,
                                                            args.scales[0],
                                                            args.scales[1]);
     }
+    else if(args.use_int8)
+    {
+        if(!args.compType_assigned)
+            args.compTypeId = AppDataType::appInt8;
+
+        if(args.outType_assigned &&
+           (args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32))
+            args.outTypeId = AppDataType::appInt32;
+
+        if(!args.outType_assigned)
+            args.outTypeId = AppDataType::appInt8;
+
+        if(args.compTypeId == AppDataType::appInt8)
+        {
+            profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
+                                                        args.init_method,
+                                                        args.do_log,
+                                                        args.do_dumpout,
+                                                        args.nrepeat,
+                                                        args.inLengths,
+                                                        args.reduceDims,
+                                                        args.reduceOp,
+                                                        args.nanOpt,
+                                                        args.indicesOpt,
+                                                        args.scales[0],
+                                                        args.scales[1]);
+        }
+        else if(args.compTypeId == AppDataType::appInt32)
+        {
+            profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
+                                                         args.init_method,
+                                                         args.do_log,
+                                                         args.do_dumpout,
+                                                         args.nrepeat,
+                                                         args.inLengths,
+                                                         args.reduceDims,
+                                                         args.reduceOp,
+                                                         args.nanOpt,
+                                                         args.indicesOpt,
+                                                         args.scales[0],
+                                                         args.scales[1]);
+        }
+        else
+            throw std::runtime_error("Invalid compType assignment!");
+    }
+    else if(args.use_bf16)
+    {
+        if(args.outType_assigned &&
+           (args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != AppDataType::appFloat))
+            args.outTypeId = AppDataType::appFloat;
+
+        if(!args.outType_assigned)
+            args.outTypeId = AppDataType::appBFloat16;
+
+        profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
+                                                             args.init_method,
+                                                             args.do_log,
+                                                             args.do_dumpout,
+                                                             args.nrepeat,
+                                                             args.inLengths,
+                                                             args.reduceDims,
+                                                             args.reduceOp,
+                                                             args.nanOpt,
+                                                             args.indicesOpt,
+                                                             args.scales[0],
+                                                             args.scales[1]);
+    }
     else
     {
-        if(args.compTypeId == appFloat)
+        if(args.compTypeId == AppDataType::appFloat)
         {
             profile_reduce_impl<float, float, float>(args.do_verification,
                                                      args.init_method,
...
@@ -395,14 +473,14 @@ int profile_reduce(int argc, char* argv[])
                                                      args.do_dumpout,
                                                      args.nrepeat,
                                                      args.inLengths,
-                                                     args.toReduceDims,
+                                                     args.reduceDims,
                                                      args.reduceOp,
                                                      args.nanOpt,
                                                      args.indicesOpt,
                                                      args.scales[0],
                                                      args.scales[1]);
         }
-        else if(args.compTypeId == appDouble)
+        else if(args.compTypeId == AppDataType::appDouble)
         {
             profile_reduce_impl<float, double, float>(args.do_verification,
                                                       args.init_method,
...
@@ -410,7 +488,7 @@ int profile_reduce(int argc, char* argv[])
                                                       args.do_dumpout,
                                                       args.nrepeat,
                                                       args.inLengths,
-                                                      args.toReduceDims,
+                                                      args.reduceDims,
                                                       args.reduceOp,
                                                       args.nanOpt,
                                                       args.indicesOpt,
...
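Most of this diff is the mechanical fallout of two renames: toReduceDims becomes reduceDims, and the `typedef enum { ... } appDataType_t` (along with the `_t`-suffixed ck enums) become C++ scoped enumerations. With `enum struct`, the enumerators no longer leak into the enclosing scope, so every use site gains an explicit qualifier. A minimal sketch of the difference, with types renamed so as not to collide with the real ones:

// Before: an unscoped enum; enumerators are visible in the enclosing scope.
typedef enum
{
    oldHalf  = 0,
    oldFloat = 1,
} oldDataType_t;

// After: a scoped enum ("enum struct"); enumerators must be qualified.
enum struct NewDataType
{
    appHalf  = 0,
    appFloat = 1,
};

int main()
{
    oldDataType_t a = oldFloat;              // unqualified access compiles
    NewDataType b   = NewDataType::appFloat; // qualification now required
    // NewDataType c = appFloat;             // error: appFloat not in scope
    return (a == oldFloat && b == NewDataType::appFloat) ? 0 : 1;
}

Scoped enums also no longer convert implicitly to int, which is why the option parsing keeps explicit static_cast<...>(std::atoi(optarg)) at every site.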
profiler/src/profiler.cpp

@@ -5,16 +5,20 @@
 #include <cstring>

 int profile_gemm(int, char* []);
-int profile_batched_gemm(int, char* []);
 int profile_gemm_bias_2d(int, char* []);
 int profile_gemm_bias_relu(int, char* []);
 int profile_gemm_bias_relu_add(int, char* []);
+int profile_gemm_reduce(int, char* []);
+int profile_batched_gemm(int, char* []);
+int profile_grouped_gemm(int, char* []);
 int profile_conv_fwd(int, char* []);
 int profile_conv_fwd_bias_relu(int, char* []);
 int profile_conv_fwd_bias_relu_add(int, char* []);
 int profile_conv_fwd_bias_relu_atomic_add(int, char* []);
-int profile_conv_bwd_data(int, char* []);
+int profile_convnd_bwd_data(int, char* [], int);
 int profile_reduce(int, char* []);
+int profile_conv_bwd_weight(int, char* []);
+int profile_batched_gemm_reduce(int, char* []);

 int main(int argc, char* argv[])
 {
...
@@ -34,10 +38,22 @@ int main(int argc, char* argv[])
     {
         return profile_gemm_bias_relu_add(argc, argv);
     }
+    else if(strcmp(argv[1], "gemm_reduce") == 0)
+    {
+        return profile_gemm_reduce(argc, argv);
+    }
     else if(strcmp(argv[1], "batched_gemm") == 0)
     {
         return profile_batched_gemm(argc, argv);
     }
+    else if(strcmp(argv[1], "batched_gemm_reduce") == 0)
+    {
+        return profile_batched_gemm_reduce(argc, argv);
+    }
+    else if(strcmp(argv[1], "grouped_gemm") == 0)
+    {
+        profile_grouped_gemm(argc, argv);
+    }
     else if(strcmp(argv[1], "conv_fwd") == 0)
     {
         return profile_conv_fwd(argc, argv);
...
@@ -54,14 +70,26 @@ int main(int argc, char* argv[])
     {
         return profile_conv_fwd_bias_relu_atomic_add(argc, argv);
     }
-    else if(strcmp(argv[1], "conv_bwd") == 0)
+    else if(strcmp(argv[1], "conv1d_bwd_data") == 0)
     {
-        return profile_conv_bwd_data(argc, argv);
+        return profile_convnd_bwd_data(argc, argv, 1);
     }
+    else if(strcmp(argv[1], "conv2d_bwd_data") == 0)
+    {
+        return profile_convnd_bwd_data(argc, argv, 2);
+    }
+    else if(strcmp(argv[1], "conv3d_bwd_data") == 0)
+    {
+        return profile_convnd_bwd_data(argc, argv, 3);
+    }
     else if(strcmp(argv[1], "reduce") == 0)
     {
         return profile_reduce(argc, argv);
     }
+    else if(strcmp(argv[1], "conv2d_bwd_weight") == 0)
+    {
+        return profile_conv_bwd_weight(argc, argv);
+    }
     else
     {
         // clang-format off
...
@@ -69,14 +97,18 @@ int main(int argc, char* argv[])
            " gemm_bias_2d: GEMM+Bias(2D)\n"
            " gemm_bias_relu: GEMM+Bias+ReLU\n"
            " gemm_bias_relu_add: GEMM+Bias+ReLU+Add\n"
+           " gemm_reduce: GEMM+Reduce\n"
+           " grouped_gemm: Grouped GEMM\n"
            " conv_fwd: ForwardConvolution\n"
            " conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
            " conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
            " conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n"
-           " conv_bwd: BackwardConvolution\n"
-           " reduce: REDUCE\n");
+           " conv1d_bwd_data: BackwardConvolution data 1 dim\n"
+           " conv2d_bwd_data: BackwardConvolution data 2 dim\n"
+           " conv3d_bwd_data: BackwardConvolution data 3 dim\n"
+           " reduce: REDUCE\n"
+           " conv2d_bwd_weight: Backward Weight Convolution 2d\n");
         // clang-format on

         return 0;
     }

     return 0;
 }
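The single conv_bwd entry is replaced by one implementation, profile_convnd_bwd_data(int, char*[], int), parameterized by spatial dimensionality, so the conv1d/conv2d/conv3d subcommands share code. A sketch of that table-driven dispatch shape; the function body below is an illustrative stand-in, not the profiler's:

#include <cstdio>
#include <cstring>

// Illustrative stand-in for profile_convnd_bwd_data(int, char*[], int):
// one entry point that takes the number of spatial dimensions.
static int run_convnd_bwd_data(int /*argc*/, char* /*argv*/[], int num_dims)
{
    std::printf("running %dD backward-data convolution profile\n", num_dims);
    return 0;
}

int main(int argc, char* argv[])
{
    if(argc < 2)
        return 1;

    // Each subcommand maps to the shared implementation plus its dims
    // argument, mirroring the strcmp chain added in profiler.cpp.
    struct { const char* name; int dims; } table[] = {
        {"conv1d_bwd_data", 1}, {"conv2d_bwd_data", 2}, {"conv3d_bwd_data", 3}};

    for(const auto& entry : table)
        if(std::strcmp(argv[1], entry.name) == 0)
            return run_convnd_bwd_data(argc, argv, entry.dims);

    return 1;
}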
script/cmake-rocm.sh

 rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles

-MY_PROJECT_SOURCE=../../..
+MY_PROJECT_SOURCE=../
 MY_PROJECT_INSTALL=../install.dir

 cmake                                                                                             \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                                     \
 -D BUILD_DEV=OFF                                                                                  \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
--D CMAKE_CXX_FLAGS=" -DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
+-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                         \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                                                    \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 ${MY_PROJECT_SOURCE}
+
+#-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
+#-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \
script/count_vgpr.sh

 #!/bin/bash

 FILE=$1

-echo v0 $(grep -w v0 $FILE | wc -l)
-echo v1 $(grep -w v1 $FILE | wc -l)
-echo v2 $(grep -w v2 $FILE | wc -l)
 ... (the same "echo vN $(grep -w vN $FILE | wc -l)" line, removed for v3 through v253) ...
-echo v254 $(grep -w v254 $FILE | wc -l)
-echo v255 $(grep -w v255 $FILE | wc -l)
+for num in {0..255}
+do
+    base_pattern="(\[?${num}\b|\[\d*:${num}\])"
+    spattern="s${base_pattern}"
+    vpattern="v${base_pattern}"
+    apattern="a${base_pattern}"
+    scount=$(grep -P $spattern $FILE | wc -l)
+    vcount=$(grep -P $vpattern $FILE | wc -l)
+    acount=$(grep -P $apattern $FILE | wc -l)
+    bash -c "echo -n v${num} $vcount && \
+             echo -n , s${num} $scount && \
+             echo -n , a${num} $acount"
+    if [[ $scount -ne 0 || $vcount -ne 0 || $acount -ne 0 ]]; then
+        echo -n " *"
+    fi
+    echo ""
+done
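The rewritten loop replaces the 256 per-register `grep -w vN` counts with one PCRE per register index, extended to scalar (s), vector (v), and accumulator (a) registers. For num=12 the vector pattern is `v(\[?12\b|\[\d*:12\])`, which counts bare uses like `v12` and range starts or ends like `v[12:15]` or `v[8:12]`, while rejecting `v120`. A small sketch checking that alternation with std::regex (ECMAScript syntax rather than grep -P, but the alternation behaves the same here):

#include <iostream>
#include <regex>
#include <string>

int main()
{
    // Same alternation the script builds for num=12: a bare "v12" bounded
    // by a word break on the left, or a register range "[d*:12]" on the right.
    const std::regex vgpr12(R"(v(\[?12\b|\[\d*:12\]))");

    for(const std::string line : {"v_add_u32 v12, v3, v4",          // counted: bare v12
                                  "buffer_load_dwordx4 v[9:12], a", // counted: range ends at 12
                                  "v_mov_b32 v120, 0"})             // skipped: v120 is not v12
    {
        std::cout << line << " -> "
                  << (std::regex_search(line, vgpr12) ? "counted" : "skipped") << '\n';
    }
    return 0;
}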
script/profile_reduce_no_index.sh

 #!/bin/bash

-PRECISION= ##--half
+PRECISION=
+##PRECISION=--half
+##PRECISION=--double
+##PRECISION=--int8
+##PRECISION=--bf16

-if test -n $PRECISION && test "$PRECISION" = "--half"; then
-    CTYPE="-C 1"
-else
-    CTYPE=""
+if [ -n $PRECISION ] && [ "$PRECISION" = "--half" -o "$PRECISION" = "--bf16" ]; then
+    ACCTYPE="-C 1"
+elif [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
+    ACCTYPE="-C 2"
 fi

-WTYPE=
-
-if [ $# -ge 1 ]; then
-    NREPEAT=$1
-else
-    NREPEAT=1
-fi
+driver="./bin/ckProfiler"
+
+VERIFY="-v $1"
+INIT=$2
+NREPEAT=$3

-Operation=7  #### 0 - ADD, 5 - AVG, 7 - NORM2
+Operations="0 5 7"  #### 0 - ADD, 5 - AVG; for int8, no NORM2 supported
+
+if [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
+    Operations=5
+fi

 ## for generic validation
-for op in $Operation; do
+for op in $Operations; do
     set -x

-    ./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0     -O $op $CTYPE -v 1 1 $NREPEAT
-    ./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0     -O $op $CTYPE -v 1 1 $NREPEAT
-    ./bin/ckProfiler reduce $PRECISION -D 280,4,64,82 -R 0     -O $op $CTYPE -v 1 1 $NREPEAT
-    ./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT
-    ./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT
-    ./bin/ckProfiler reduce $PRECISION -D 64,280,82,4 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT
-    ./bin/ckProfiler reduce $PRECISION -D 700,8192    -R 1     -O $op $CTYPE -v 1 1 $NREPEAT
-    ./bin/ckProfiler reduce $PRECISION -D 700,1024    -R 1     -O $op $CTYPE -v 1 1 $NREPEAT
-    ./bin/ckProfiler reduce $PRECISION -D 700,4       -R 1     -O $op $CTYPE -v 1 1 $NREPEAT
+    #######        datatype     layout     reduce dims  op  acctype  verify  init  repeats
+    $driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 64,4,280,82 -R 0       -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 64,4,280,82 -R 1       -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 64,4,280,82 -R 2       -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 64,4,280,82 -R 3       -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2   -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 64,4,280,82 -R 1,2,3   -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 64,4,280,82 -R 0,2,3   -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 64,4,280,82 -R 0,1,3   -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 256,22960   -R 0       -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 256,22960   -R 1       -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 4,1469440   -R 0       -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 4,1469440   -R 1       -O $op $ACCTYPE $VERIFY $INIT $NREPEAT

     set +x
 done

-Operation=5 #### 0 - ADD, 5 - AVG, 7 - NORM2
+Operations=5

 ## for performance evaluation (resnet50 NHWC => C)
-for op in $Operation; do
+for op in $Operations; do
     set -x
-    ./bin/ckProfiler reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
     ... (22 more "./bin/ckProfiler ... $CTYPE $WTYPE -v 1 1 $NREPEAT" lines, removed, over the same 23 resnet50 shapes listed below) ...
+    ####### datatype layout reduce dims op acctype verify init repeats
+    $driver reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 256,28,28,128  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 256,58,58,128  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 256,7,7,2048   -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 256,14,14,256  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 256,30,30,256  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 256,56,56,256  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 256,16,16,512  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 256,28,28,512  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 256,7,7,512    -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 256,56,56,64   -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 256,230,230,3  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 128,28,28,128  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 128,58,58,128  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 128,7,7,2048   -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 128,14,14,256  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 128,30,30,256  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 128,56,56,256  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 128,16,16,512  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 128,28,28,512  -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 128,7,7,512    -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
+    $driver reduce $PRECISION -D 128,56,56,64   -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
     set +x
 done
script/profile_reduce_with_index.sh

 #!/bin/bash

-PRECISION= ##--half
+PRECISION=
+##PRECISION=--half
+##PRECISION=--double
+##PRECISION=--int8
+##PRECISION=--bf16

-if [ $# -ge 1 ]; then
-    NREPEAT=$1
-else
-    NREPEAT=1
-fi
+driver="./bin/ckProfiler"

-Operation=4
-LENGTHS=64,4,280,82
+VERIFY="-v $1"
+INIT=$2
+NREPEAT=$3
+
+#### 2 - MIN, 3 - MAX, 4 - AMAX
+Operations="2 4"

 ## for generic validation
-for op in $Operation; do
+for op in $Operations; do
+for use_idx in 0 1; do
     set -x

-    ./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT
     ... (8 more "./bin/ckProfiler ... $CTYPE -v 1 1 $NREPEAT" lines, removed, over the same -D/-R combinations as in profile_reduce_no_index.sh) ...
+    ####### datatype layout reduce dims op use-index verify init repeats
+    $driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
     ... (12 more "$driver ... -I $use_idx $VERIFY $INIT $NREPEAT" lines, added, over the same 13 -D/-R combinations as in profile_reduce_no_index.sh) ...

     set +x
 done
+done

+Operations=2

 ## for performance evaluation (resnet50 NHWC => C)
-for op in $Operation; do
+for op in $Operations; do
 for use_idx in 0 1; do
     set -x
-    ./bin/ckProfiler reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
     ... (22 more "./bin/ckProfiler" lines, removed, over the same 23 resnet50 shapes as in profile_reduce_no_index.sh) ...
+    ####### datatype layout reduce dims op use-index verify init repeats
+    $driver reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
     ... (22 more "$driver" lines, added, over the same 23 resnet50 shapes as in profile_reduce_no_index.sh) ...
     set +x
 done
 done
...
script/test_convnd_fwd.sh  0 → 100644

#!/usr/bin/env bash
# set -e

DIM1=False
DIM2=True
DIM3=False

DATE=220317
GIT_HASH=4e6dfda
LOG_DIR=${DATE}_${GIT_HASH}
SUFFIX=${GIT_HASH}

#--------------------------------------------------------------------------
# Commandline arguments parsing
# like: cmd -key[--key] value
#--------------------------------------------------------------------------
POSITIONAL=()
while [[ $# -gt 0 ]]
do
key="$1"

case $key in
    -d1|--d1)
    DIM1=True
    echo DIM1: "${DIM1}"
    shift # past argument
    ;;
    -d2|--d2)
    DIM2=True
    echo DIM2: "${DIM2}"
    shift # past argument
    ;;
    -d3|--d3)
    DIM3=True
    echo DIM3: "${DIM3}"
    shift # past argument
    ;;
    -all|--all)
    DIM1=True
    DIM2=True
    DIM3=True
    echo DIM1: "${DIM1}"
    echo DIM2: "${DIM2}"
    echo DIM3: "${DIM3}"
    shift # past argument
    ;;
    -s|--suffix)
    SUFFIX=${SUFFIX}_"$2"
    echo SUFFIX: "${SUFFIX}"
    shift # past argument
    shift # past value
    ;;
    *)    # unknown option
    POSITIONAL+=("$1") # save it in an array for later
    shift # past argument
    ;;
esac
done
set -- "${POSITIONAL[@]}" # restore positional parameters
#--------------------------------------------------------------------------

# NUMACTL="numactl --cpunodebind=1 --membind=1"
NUMACTL=
# ENV_CONF=
GPU=mi100
PROF_ITER_COUNT=10000
LOG_DIR_PATH=../log/${LOG_DIR}

set -x

#-------------------------------------------------------------------------------
# 1D
#-------------------------------------------------------------------------------
if [[ "${DIM1}" == "True" ]]; then
    mkdir -p ${LOG_DIR_PATH}
    echo ">>>>>>>> RUN test conv1d nwc <<<<<<<<<<"
    CMD="./../build/bin/test_conv1d_fwd"
    ${NUMACTL} ${CMD} 2>&1 \
    | tee ${LOG_DIR_PATH}/test_conv1d_fwd_nwc_${SUFFIX}_${GPU}.log
fi

#-------------------------------------------------------------------------------
# 2D
#-------------------------------------------------------------------------------
if [[ "${DIM2}" == "True" ]]; then
    mkdir -p ${LOG_DIR_PATH}
    echo ">>>>>>>> RUN test conv2d nhwc <<<<<<<<<<"
    CMD="./../build/bin/test_conv2d_fwd"
    ${NUMACTL} ${CMD} 2>&1 \
    | tee ${LOG_DIR_PATH}/test_conv2d_fwd_nhwc_${SUFFIX}_${GPU}.log
fi

#-------------------------------------------------------------------------------
# 3D
#-------------------------------------------------------------------------------
if [[ "${DIM3}" == "True" ]]; then
    mkdir -p ${LOG_DIR_PATH}
    echo ">>>>>>>> RUN test conv3d ndhwc <<<<<<<<<<"
    CMD="./../build/bin/test_conv3d_fwd"
    ${NUMACTL} ${CMD} 2>&1 \
    | tee ${LOG_DIR_PATH}/test_conv3d_fwd_ndhwc_${SUFFIX}_${GPU}.log
fi
script/test_reduce_no_index.sh  0 → 100755

#!/bin/bash

## The following will be used for CI

set -x

## for float
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2

## for float16
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 1 2

## for int8_t
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 3 2

## for bfloat16
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 5 2

set +x
script/test_reduce_with_index.sh
0 → 100755
View file @
dd6a8de4
#!/bin/bash

## The following will be used for CI

set -x

## for float
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2

## for float16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 1 2

## for int8_t
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 3 2

## for bfloat16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 5 2

set +x
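In both scripts the arguments are the tensor lengths (-D), the dimensions to reduce (-R), then a data-type code and an init-method code. Below is a minimal sketch of the type-code mapping implied by the comment blocks above; the enum is our illustration, and codes 2 and 4 are not exercised by these scripts, so they are omitted.

#include <cstdio>

// Hypothetical restatement of the numeric data-type codes used by the
// test_reduce_* command lines above, inferred from the script comments
// ("for float", "for float16", "for int8_t", "for bfloat16").
enum ReduceTestDataType
{
    REDUCE_TEST_FLOAT    = 0, // "## for float"
    REDUCE_TEST_FLOAT16  = 1, // "## for float16"
    REDUCE_TEST_INT8     = 3, // "## for int8_t"
    REDUCE_TEST_BFLOAT16 = 5  // "## for bfloat16"
};

int main()
{
    // "bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 0 2" thus reduces
    // all four dimensions of a 64x4x280x82 float tensor, init method 2.
    std::printf("float16 code = %d\n", REDUCE_TEST_FLOAT16);
    return 0;
}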
test/CMakeLists.txt
@@ -15,7 +15,9 @@ include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
    ${PROJECT_SOURCE_DIR}/test/include
    ${PROJECT_SOURCE_DIR}/profiler/include
    ${PROJECT_SOURCE_DIR}/external/include/half
)
@@ -37,6 +39,10 @@ add_subdirectory(conv_util)
add_subdirectory(reference_conv_fwd)
add_subdirectory(gemm)
add_subdirectory(gemm_split_k)
add_subdirectory(conv2d_fwd)
add_subdirectory(gemm_reduce)
add_subdirectory(batched_gemm)
add_subdirectory(batched_gemm_reduce)
add_subdirectory(grouped_gemm)
add_subdirectory(convnd_fwd)
add_subdirectory(conv2d_bwd_data)
add_subdirectory(reduce)
add_subdirectory(conv2d_bwd_weight)
test/batched_gemm/CMakeLists.txt
0 → 100644
add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_fp16 PRIVATE host_tensor)
target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
test/batched_gemm/batched_gemm_fp16.cpp
0 → 100644
#include <iostream>
#include "profile_batched_gemm_impl.hpp"

namespace {

using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

} // namespace

int main()
{
    int M          = 512;
    int N          = 256;
    int K          = 128;
    int BatchCount = 3;

    bool pass = true;

    pass = pass &&
           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
               true, 1, false, 1, M, N, K, K, N, N, BatchCount);

    pass = pass &&
           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
               true, 1, false, 1, M, N, K, K, K, N, BatchCount);

    pass = pass &&
           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
               true, 1, false, 1, M, N, K, M, N, N, BatchCount);

    pass = pass &&
           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
               true, 1, false, 1, M, N, K, M, K, N, BatchCount);

    std::cout << "test BatchedGEMM fp16: " << (pass ? "Pass" : "Fail") << std::endl;

    return pass ? 0 : 1;
}
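In each call above, the three stride arguments that follow M, N, and K are the leading dimensions of A, B, and C for the chosen layouts: a row-major matrix strides by its column count, a column-major one by its row count. A self-contained sketch of that convention follows (the helper is ours, not a library function):

#include <cassert>

// Hypothetical helper showing how the StrideA/StrideB/StrideC arguments
// above are chosen: the stride is the leading dimension, i.e. the distance
// in elements between consecutive rows (row-major) or columns (col-major).
enum class Layout
{
    RowMajor,
    ColumnMajor
};

int DefaultStride(int rows, int cols, Layout layout)
{
    // Row-major: consecutive rows are `cols` elements apart.
    // Column-major: consecutive columns are `rows` elements apart.
    return layout == Layout::RowMajor ? cols : rows;
}

int main()
{
    const int M = 512, N = 256, K = 128;

    // Row/Row/Row case above passes (K, N, N):
    assert(DefaultStride(M, K, Layout::RowMajor) == K);    // StrideA
    assert(DefaultStride(K, N, Layout::RowMajor) == N);    // StrideB
    assert(DefaultStride(M, N, Layout::RowMajor) == N);    // StrideC

    // Row/Col/Row case above passes (K, K, N); only StrideB changes:
    assert(DefaultStride(K, N, Layout::ColumnMajor) == K); // StrideB

    return 0;
}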
test/batched_gemm/batched_gemm_util.hpp
0 → 100644
#ifndef BATCHED_GEMM_UTILS_HPP
#define BATCHED_GEMM_UTILS_HPP

#include <stdexcept> // for std::runtime_error

#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"

namespace ck {
namespace batched_gemm_util {

struct GemmParams
{
    GemmParams()
        : M(1024), N(1024), K(1024), StrideA(1024), StrideB(1024), StrideC(1024), alpha(1), beta(0)
    {
    }

    ck::index_t M;
    ck::index_t N;
    ck::index_t K;

    ck::index_t StrideA;
    ck::index_t StrideB;
    ck::index_t StrideC;

    float alpha;
    float beta;
};

template <typename BatchedGemmInstance,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
void RunHostBatchedGemm(const Tensor<ADataType>& A,
                        const Tensor<BDataType>& B,
                        Tensor<CDataType>& C,
                        AElementwiseOperation a_element_op,
                        BElementwiseOperation b_element_op,
                        CElementwiseOperation c_element_op)
{
    auto ref_batched_gemm = BatchedGemmInstance{};
    auto ref_invoker      = ref_batched_gemm.MakeInvoker();

    auto ref_argument =
        ref_batched_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op);

    ref_invoker.Run(ref_argument);
}

template <typename DeviceGemmPtr,
          typename ADataType,
          typename BDataType,
          typename CDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
void RunDeviceBatchedGemm(DeviceGemmPtr& batched_gemm_ptr,
                          const ck::batched_gemm_util::GemmParams& params,
                          const Tensor<ADataType>& A,
                          const Tensor<BDataType>& B,
                          Tensor<CDataType>& C,
                          AElementwiseOperation a_element_op,
                          BElementwiseOperation b_element_op,
                          CElementwiseOperation c_element_op)
{
    DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpace());
    DeviceMem b_g_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace());
    DeviceMem c_g_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace());

    a_g_m_k_device_buf.ToDevice(A.mData.data());
    b_g_k_n_device_buf.ToDevice(B.mData.data());

    const auto batch_count = A.mDesc.GetLengths()[0];
    auto invoker_ptr       = batched_gemm_ptr->MakeInvokerPointer();
    auto argument_ptr      = batched_gemm_ptr->MakeArgumentPointer(
        static_cast<ADataType*>(a_g_m_k_device_buf.GetDeviceBuffer()),
        static_cast<BDataType*>(b_g_k_n_device_buf.GetDeviceBuffer()),
        static_cast<CDataType*>(c_g_m_n_device_buf.GetDeviceBuffer()),
        params.M,
        params.N,
        params.K,
        params.StrideA,
        params.StrideB,
        params.StrideC,
        a_element_op,
        b_element_op,
        c_element_op,
        batch_count);

    if(!batched_gemm_ptr->IsSupportedArgument(argument_ptr.get()))
    {
        throw std::runtime_error("wrong! device_gemm with the specified compilation parameters does "
                                 "not support this GEMM problem");
    }

    invoker_ptr->Run(argument_ptr.get());

    c_g_m_n_device_buf.FromDevice(C.mData.data());
}

} // namespace batched_gemm_util
} // namespace ck
#endif
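For reference, what the host-side verification path computes is plain batched GEMM: C[g] = A[g] * B[g] for each batch g. Below is a self-contained, naive restatement of those semantics for row-major tensors, as our illustration rather than the library's reference implementation:

#include <cassert>
#include <vector>

// Naive batched GEMM over contiguous row-major buffers:
// A is G x M x K, B is G x K x N, C is G x M x N.
void naive_batched_gemm(const std::vector<float>& A,
                        const std::vector<float>& B,
                        std::vector<float>& C,
                        int G, int M, int N, int K)
{
    for(int g = 0; g < G; ++g)
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
            {
                float acc = 0.f;
                for(int k = 0; k < K; ++k)
                    acc += A[(g * M + m) * K + k] * B[(g * K + k) * N + n];
                C[(g * M + m) * N + n] = acc;
            }
}

int main()
{
    const int G = 2, M = 2, N = 2, K = 2;
    std::vector<float> A(G * M * K, 1.f), B(G * K * N, 1.f), C(G * M * N, 0.f);

    naive_batched_gemm(A, B, C, G, M, N, K);

    // With all-ones inputs every output entry equals K.
    assert(C[0] == static_cast<float>(K));
    return 0;
}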
test/batched_gemm_reduce/CMakeLists.txt
0 → 100644
include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/profiler/include
    ${PROJECT_SOURCE_DIR}/test/include
    ${PROJECT_SOURCE_DIR}/external/include/half
)

add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE host_tensor)
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
0 → 100644
#include <iostream>
#include "profile_batched_gemm_reduce_impl.hpp"

int main()
{
    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;

    int M          = 512;
    int N          = 256;
    int K          = 128;
    int BatchCount = 3;

    bool pass = true;

    pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
                                                                  ck::half_t,
                                                                  ck::half_t,
                                                                  float,
                                                                  Row,
                                                                  Row,
                                                                  Row>(
                       true, 1, false, 1, M, N, K, K, N, N, BatchCount);

    pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
                                                                  ck::half_t,
                                                                  ck::half_t,
                                                                  float,
                                                                  Row,
                                                                  Col,
                                                                  Row>(
                       true, 1, false, 1, M, N, K, K, K, N, BatchCount);

    pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
                                                                  ck::half_t,
                                                                  ck::half_t,
                                                                  float,
                                                                  Col,
                                                                  Row,
                                                                  Row>(
                       true, 1, false, 1, M, N, K, M, N, N, BatchCount);

    pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
                                                                  ck::half_t,
                                                                  ck::half_t,
                                                                  float,
                                                                  Col,
                                                                  Col,
                                                                  Row>(
                       true, 1, false, 1, M, N, K, M, K, N, BatchCount);

    if(pass)
    {
        std::cout << "test BatchedGEMM+Reduce fp16: Pass" << std::endl;
        return 0;
    }
    else
    {
        std::cout << "test BatchedGEMM+Reduce fp16: Fail" << std::endl;
        return -1;
    }
}
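The fourth template argument (float) is the reduction accumulator type: fp16 products are reduced in fp32 so small contributions are not swallowed by a narrow running sum. A self-contained analog of that effect, shown one precision level up (float vs. double, since standard C++ has no built-in half type):

#include <cstdio>

int main()
{
    // Summing many small terms in a narrow accumulator loses precision once
    // the running sum grows large relative to each term; a wider accumulator
    // stays close to the exact result.
    float  narrow = 0.f;
    double wide   = 0.0;

    for(int i = 0; i < 10000000; ++i)
    {
        narrow += 1.0e-4f;
        wide += 1.0e-4;
    }

    // Exact sum is 1000; the float accumulator drifts visibly.
    std::printf("narrow = %f, wide = %f (exact: 1000)\n", narrow, wide);
    return 0;
}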
test/conv2d_bwd_data/conv2d_bwd_data.cpp
@@ -121,15 +121,17 @@ int main(int argc, char* argv[])
         exit(1);
     }

-    auto Run = [&](auto input_type, auto wei_type, auto out_type) {
+    auto Run = [&](auto input_type, auto wei_type, auto out_type, auto acc_type) {
         using InDataType  = decltype(input_type);
         using WeiDataType = decltype(wei_type);
         using OutDataType = decltype(out_type);
+        using AccDataType = decltype(acc_type);

         using ReferenceConvBwdInstance =
             ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
                                                              WeiDataType,
                                                              OutDataType,
+                                                             AccDataType,
                                                              InElementOp,
                                                              WeiElementOp,
                                                              OutElementOp>;
@@ -182,8 +184,8 @@ int main(int argc, char* argv[])
         out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
         wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());

-        in_n_c_hi_wi_device_result.GenerateTensorValue(GeneratorTensor_1<InDataType>{5});
+        // reset input to zero
+        in_n_c_hi_wi_device_result.GenerateTensorValue(GeneratorTensor_1<InDataType>{0});
         in_device_buf.ToDevice(in_n_c_hi_wi_device_result.mData.data());

         // get host result
@@ -225,9 +227,9 @@ int main(int argc, char* argv[])
             ck::tensor_operation::device::device_conv2d_bwd_data_instance::
                 add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
         }
-        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ushort> &&
-                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ushort> &&
-                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ushort>)
+        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::bhalf_t> &&
+                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::bhalf_t> &&
+                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::bhalf_t>)
         {
             ck::tensor_operation::device::device_conv2d_bwd_data_instance::
                 add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
@@ -293,33 +295,33 @@ int main(int argc, char* argv[])
         if(success)
         {
             std::cout << "test conv2d bwd: Pass" << std::endl;
             return 0;
         }
         else
         {
             std::cout << "test conv2d bwd: Fail" << std::endl;
             return -1;
         }
     };

     if(data_type == 0)
     {
-        Run(F32(), F32(), F32());
+        return Run(F32(), F32(), F32(), F32());
     }
     else if(data_type == 1)
     {
-        Run(F16(), F16(), F16());
+        return Run(F16(), F16(), F16(), F32());
     }
     else if(data_type == 2)
     {
-        Run(BF16(), BF16(), BF16());
+        return Run(BF16(), BF16(), BF16(), F32());
     }
     else if(data_type == 3)
     {
-        Run(INT8(), INT8(), INT8());
+        return Run(INT8(), INT8(), INT8(), int());
     }
     else
     {
         return 1;
     }

     return 0;
 }
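The heart of this diff is threading an accumulation type (AccDataType) through the generic Run lambda, so each data-type combination verifies against an appropriately wide accumulator (fp32 for fp16/bf16, int for int8). A stripped-down, compilable sketch of that tag-dispatch pattern follows; the tag structs stand in for the test's F16/F32 and friends:

#include <iostream>

// Stand-in tag types: empty structs whose only job is to carry a type into
// the generic lambda, mirroring the test's F16()/F32() arguments.
struct F32
{
};
struct F16
{
};

int main()
{
    // The generic lambda deduces each data type from its tag argument, so one
    // body serves every (input, weight, output, accumulator) combination.
    auto Run = [](auto input_type, auto wei_type, auto out_type, auto acc_type) {
        using InDataType  = decltype(input_type);
        using WeiDataType = decltype(wei_type);
        using OutDataType = decltype(out_type);
        using AccDataType = decltype(acc_type);

        // A real test would pick device instances and verify in AccDataType;
        // here we only show that all four types are available to the body.
        std::cout << "sizes: " << sizeof(InDataType) << " " << sizeof(WeiDataType) << " "
                  << sizeof(OutDataType) << " " << sizeof(AccDataType) << std::endl;
        return 0;
    };

    // fp16 in/weight/out with fp32 accumulation -- mirrors
    // `return Run(F16(), F16(), F16(), F32());` in the diff above.
    return Run(F16{}, F16{}, F16{}, F32{});
}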
test/conv2d_bwd_weight/CMakeLists.txt
0 → 100644
include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/profiler/include
    ${PROJECT_SOURCE_DIR}/external/include/half
)

add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor)
target_link_libraries(test_conv2d_bwd_weight PRIVATE device_conv2d_bwd_weight_instance)
test/conv2d_bwd_weight/conv2d_bwd_weight.cpp
0 → 100644
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include <vector>

#include "conv_fwd_util.hpp"
#include "profile_conv_bwd_weight_impl.hpp"

int test_self()
{
    bool pass = true;

    std::vector<ck::utils::conv::ConvParams> params;

    params.push_back({2, 128, 256, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
    params.push_back({2, 128, 256, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
    params.push_back({2, 128, 256, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});

    for(auto& param : params)
    {
        // f32
        pass &= ck::profiler::profile_conv_bwd_weight_impl<2,
                                                           float,
                                                           float,
                                                           float,
                                                           ck::tensor_layout::convolution::NHWC,
                                                           ck::tensor_layout::convolution::KYXC,
                                                           ck::tensor_layout::convolution::NHWK>(
            1, // do_verification,
            1, // init_method,
            0, // do_log,
            1, // nrepeat,
            param.N,
            param.K,
            param.C,
            param.input_spatial_lengths,
            param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(),
            param.conv_filter_strides,
            param.conv_filter_dilations,
            param.input_left_pads,
            param.input_right_pads,
            2);

        // fp16
        pass &= ck::profiler::profile_conv_bwd_weight_impl<2,
                                                           ck::half_t,
                                                           ck::half_t,
                                                           ck::half_t,
                                                           ck::tensor_layout::convolution::NHWC,
                                                           ck::tensor_layout::convolution::KYXC,
                                                           ck::tensor_layout::convolution::NHWK>(
            1, // do_verification,
            1, // init_method,
            0, // do_log,
            1, // nrepeat,
            param.N,
            param.K,
            param.C,
            param.input_spatial_lengths,
            param.filter_spatial_lengths,
            param.GetOutputSpatialLengths(),
            param.conv_filter_strides,
            param.conv_filter_dilations,
            param.input_left_pads,
            param.input_right_pads,
            2);
    }

    return pass;
}

int main(int argc, char* argv[])
{
    int data_type   = 0;
    int init_method = 0;

    // Conv shape
    ck::index_t N               = 128;
    ck::index_t K               = 256;
    ck::index_t C               = 192;
    ck::index_t Y               = 3;
    ck::index_t X               = 3;
    ck::index_t Hi              = 71;
    ck::index_t Wi              = 71;
    ck::index_t conv_stride_h   = 2;
    ck::index_t conv_stride_w   = 2;
    ck::index_t conv_dilation_h = 1;
    ck::index_t conv_dilation_w = 1;
    ck::index_t in_left_pad_h   = 1;
    ck::index_t in_left_pad_w   = 1;
    ck::index_t in_right_pad_h  = 1;
    ck::index_t in_right_pad_w  = 1;
    ck::index_t split_k         = 1;

    bool pass = true;

    if(argc == 1)
    {
        pass = test_self();
    }
    else
    {
        if(argc == 3)
        {
            data_type   = std::stoi(argv[1]);
            init_method = std::stoi(argv[2]);
        }
        else if(argc == 19)
        {
            data_type       = std::stoi(argv[1]);
            init_method     = std::stoi(argv[2]);
            N               = std::stoi(argv[3]);
            K               = std::stoi(argv[4]);
            C               = std::stoi(argv[5]);
            Y               = std::stoi(argv[6]);
            X               = std::stoi(argv[7]);
            Hi              = std::stoi(argv[8]);
            Wi              = std::stoi(argv[9]);
            conv_stride_h   = std::stoi(argv[10]);
            conv_stride_w   = std::stoi(argv[11]);
            conv_dilation_h = std::stoi(argv[12]);
            conv_dilation_w = std::stoi(argv[13]);
            in_left_pad_h   = std::stoi(argv[14]);
            in_left_pad_w   = std::stoi(argv[15]);
            in_right_pad_h  = std::stoi(argv[16]);
            in_right_pad_w  = std::stoi(argv[17]);
            split_k         = std::stoi(argv[18]);
        }
        else
        {
            printf("arg1: data type (0=fp32, 1=fp16, 2=bfp16, 3=int8_t)\n");
            printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
            printf("arg3 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                   "RightPx, SplitK\n");
            exit(1);
        }

        ck::utils::conv::ConvParams param{2,
                                          N,
                                          K,
                                          C,
                                          {Y, X},
                                          {Hi, Wi},
                                          {conv_stride_h, conv_stride_w},
                                          {conv_dilation_h, conv_dilation_w},
                                          {in_left_pad_h, in_left_pad_w},
                                          {in_right_pad_h, in_right_pad_w}};

        if(data_type == 0)
        {
            pass = ck::profiler::profile_conv_bwd_weight_impl<
                2,
                float,
                float,
                float,
                ck::tensor_layout::convolution::NHWC,
                ck::tensor_layout::convolution::KYXC,
                ck::tensor_layout::convolution::NHWK>(1,
                                                      init_method,
                                                      0,
                                                      1,
                                                      param.N,
                                                      param.K,
                                                      param.C,
                                                      param.input_spatial_lengths,
                                                      param.filter_spatial_lengths,
                                                      param.GetOutputSpatialLengths(),
                                                      param.conv_filter_strides,
                                                      param.conv_filter_dilations,
                                                      param.input_left_pads,
                                                      param.input_right_pads,
                                                      split_k);
        }
        else if(data_type == 1)
        {
            pass = ck::profiler::profile_conv_bwd_weight_impl<
                2,
                ck::half_t,
                ck::half_t,
                ck::half_t,
                ck::tensor_layout::convolution::NHWC,
                ck::tensor_layout::convolution::KYXC,
                ck::tensor_layout::convolution::NHWK>(1,
                                                      init_method,
                                                      0,
                                                      1,
                                                      param.N,
                                                      param.K,
                                                      param.C,
                                                      param.input_spatial_lengths,
                                                      param.filter_spatial_lengths,
                                                      param.GetOutputSpatialLengths(),
                                                      param.conv_filter_strides,
                                                      param.conv_filter_dilations,
                                                      param.input_left_pads,
                                                      param.input_right_pads,
                                                      split_k);
        }
        else
        {
            std::cout << "Unsupported data type" << std::endl;
            return 1;
        }
    }

    if(pass)
    {
        std::cout << "test conv2d bwd weight: Pass" << std::endl;
        return 0;
    }
    else
    {
        std::cout << "test conv2d bwd weight: Fail" << std::endl;
        return -1;
    }
}
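The test leans on GetOutputSpatialLengths() to derive the output height and width from the input size, padding, dilation, and stride. A self-contained sketch of the standard formula it is expected to apply (the helper is ours, not the library's):

#include <cassert>

// Standard convolution output-length formula for one spatial axis:
// out = (in + pad_left + pad_right - dilation * (filter - 1) - 1) / stride + 1
int conv_out_length(int in, int filter, int stride, int dilation, int pad_l, int pad_r)
{
    return (in + pad_l + pad_r - dilation * (filter - 1) - 1) / stride + 1;
}

int main()
{
    // Default problem above: Hi=71, Y=3, stride 2, dilation 1, pads 1/1 -> Ho=36.
    assert(conv_out_length(71, 3, 2, 1, 1, 1) == 36);

    // First test_self case: 7x7 input, 1x1 filter, stride 2, no padding -> 4x4 output.
    assert(conv_out_length(7, 1, 2, 1, 0, 0) == 4);

    return 0;
}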