gaoqiong / composable_kernel

Commit 8c4e33f1 · authored Nov 15, 2021 by Chao Liu

Merge remote-tracking branch 'origin/develop' into v5r1_add

Parents: 5aed38d4, 3737bb03
Showing 16 changed files with 556 additions and 135 deletions (+556 −135)
composable_kernel/include/utility/tuple_helper.hpp                                                               +7    −16
composable_kernel/include/utility/type.hpp                                                                       +0    −15
device_operation/include/device_gemm_xdl.hpp                                                                     +0    −1
device_operation/include/gemm_common.hpp                                                                         +0    −22
external/rocm/include/bfloat16_dev.hpp                                                                           +1    −1
host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp  +2    −2
host/driver_offline/src/conv_bwd_driver_offline.cpp                                                              +12   −12
host/driver_offline/src/conv_fwd_driver_offline.cpp                                                              +54   −19
host/driver_offline/src/conv_wrw_driver_offline.cpp                                                              +12   −12
host/driver_offline/src/gemm_driver_offline.cpp                                                                  +182  −12
host/host_tensor/include/host_gemm.hpp                                                                           +21   −22
host/host_tensor/include/host_tensor.hpp                                                                         +37   −0
host/host_tensor/include/host_tensor_generator.hpp                                                               +86   −0
profiler/gemm_profiler.cpp                                                                                       +18   −1
script/profile_conv.sh                                                                                           +100  −0
script/profile_gemm.sh                                                                                           +24   −0
composable_kernel/include/utility/tuple_helper.hpp

@@ -6,22 +6,6 @@
 namespace ck {
 
-template <typename... Ts>
-struct is_known_at_compile_time<Tuple<Ts...>>
-{
-    __host__ __device__ static constexpr bool IsKnownAtCompileTime()
-    {
-        return container_reduce(
-            Tuple<Ts...>{},
-            [](auto x, bool r) {
-                return is_known_at_compile_time<remove_cvref_t<decltype(x)>>::value & r;
-            },
-            true);
-    }
-
-    static constexpr bool value = IsKnownAtCompileTime();
-};
-
 template <typename F, index_t N>
 __host__ __device__ constexpr auto generate_tuple(F&& f, Number<N>)
 {
@@ -29,6 +13,13 @@ __host__ __device__ constexpr auto generate_tuple(F&& f, Number<N>)
         typename arithmetic_sequence_gen<0, N, 1>::type{});
 }
 
+template <typename F, index_t N>
+__host__ __device__ constexpr auto generate_tie(F&& f, Number<N>)
+{
+    return unpack([&f](auto&&... xs) { return tie(f(xs)...); },
+                  typename arithmetic_sequence_gen<0, N, 1>::type{});
+}
+
 namespace detail {
 
 template <typename F, typename X, index_t... Is>
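For orientation: the newly added `generate_tie` applies `f` to each index `0..N-1` and ties the resulting references into a tuple. Below is a minimal standard-library sketch of the same idea, with `std::index_sequence` and `std::tie` standing in for ck's `arithmetic_sequence_gen`, `unpack`, and `tie` (an illustration, not the library code):

```cpp
#include <tuple>
#include <utility>

template <typename F, std::size_t... Is>
auto generate_tie_sketch_impl(F&& f, std::index_sequence<Is...>)
{
    // Expand f over each compile-time index and tie the lvalue results.
    return std::tie(f(std::integral_constant<std::size_t, Is>{})...);
}

// Build a tuple of lvalue references by evaluating f at indices 0..N-1.
template <std::size_t N, typename F>
auto generate_tie_sketch(F&& f)
{
    return generate_tie_sketch_impl(std::forward<F>(f), std::make_index_sequence<N>{});
}

int main()
{
    int x = 0, y = 1, z = 2;
    int* ptrs[] = {&x, &y, &z};

    // refs is std::tuple<int&, int&, int&>, bound to x, y, z.
    auto refs = generate_tie_sketch<3>([&](auto i) -> int& { return *ptrs[i]; });

    std::get<0>(refs) = 42; // writes through the reference to x
    return x == 42 ? 0 : 1;
}
```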
composable_kernel/include/utility/type.hpp

@@ -31,21 +31,6 @@ using remove_cvref_t = remove_cv_t<std::remove_reference_t<T>>;
 template <typename T>
 inline constexpr bool is_pointer_v = std::is_pointer<T>::value;
 
-template <typename T>
-struct is_known_at_compile_time;
-
-template <>
-struct is_known_at_compile_time<index_t>
-{
-    static constexpr bool value = false;
-};
-
-template <typename T, T X>
-struct is_known_at_compile_time<integral_constant<T, X>>
-{
-    static constexpr bool value = true;
-};
-
 template <typename Y,
           typename X,
           typename enable_if<sizeof(X) == sizeof(Y), bool>::type = false>
 __host__ __device__ constexpr Y as_type(X x)
 {
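The surviving `as_type` helper reinterprets the bits of one value as another type of equal size. A portable sketch of that semantics via `memcpy` (illustrative only, not the library's `enable_if`-constrained device implementation):

```cpp
#include <cstdint>
#include <cstring>
#include <cassert>

// Same-size bit reinterpretation, the operation as_type performs.
template <typename Y, typename X>
Y as_type_sketch(X x)
{
    static_assert(sizeof(X) == sizeof(Y), "as_type requires equal sizes");
    Y y;
    std::memcpy(&y, &x, sizeof(Y)); // copies the bit pattern, no conversion
    return y;
}

int main()
{
    // The bits of 1.0f viewed as a 32-bit integer (IEEE-754: 0x3f800000).
    std::uint32_t bits = as_type_sketch<std::uint32_t>(1.0f);
    assert(bits == 0x3f800000u);
    return 0;
}
```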
device_operation/include/device_gemm_xdl.hpp

@@ -3,7 +3,6 @@
 #include <iostream>
 #include "device.hpp"
-#include "gemm_common.hpp"
 #include "device_base.hpp"
 #include "device_gemm.hpp"
 #include "common_header.hpp"
device_operation/include/gemm_common.hpp (deleted, 100644 → 0)

#ifndef GEMM_COMMON_HPP
#define GEMM_COMMON_HPP

enum GemmMatrixLayout
{
    MK_KN_MN, // 0
    MK_NK_MN, // 1
    KM_KN_MN, // 2
    KM_NK_MN, // 3
    MK_KN_NM, // 4
    MK_NK_NM, // 5
    KM_KN_NM, // 6
    KM_NK_NM, // 7
};

enum GemmDataType
{
    F32_F32_F32, // 0
    F16_F16_F16, // 1
};

#endif
external/rocm/include/bfloat16_dev.hpp

@@ -31,7 +31,7 @@ extern "C" {
 #endif
 
 #ifdef __HIP_PLATFORM_HCC__
-#define EXECUTION_SPECIFIER __device__
+#define EXECUTION_SPECIFIER __device__ __host__
 #else
 #define EXECUTION_SPECIFIER
 #endif // MIOPEN_BACKEND_HIP
host/driver_offline/include/device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk_1x1.hpp

@@ -104,7 +104,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
     constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 4;
     constexpr index_t GemmCThreadTransferDstScalarPerVector       = 1;
-#elif 1
+#elif 0
     // [M, N, K0, K1] = [256, 128, 4, 8], C = 128, for fp16
     constexpr index_t BlockSize = 256;
@@ -132,7 +132,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
     constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmK1 = 8;
     constexpr index_t GemmCThreadTransferDstScalarPerVector       = 1;
-#elif 0
+#elif 1
     // [M, N, K0, K1] = [128, 256, 4, 8], C = 128, for fp16
    constexpr index_t BlockSize = 256;
host/driver_offline/src/conv_bwd_driver_offline.cpp

@@ -325,30 +325,30 @@ int main(int argc, char* argv[])
         // no initialization
         break;
     case 1:
-        out.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
         break;
     case 2:
-        out.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
         break;
     case 3:
-        out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
         break;
     case 4:
-        out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
         break;
     case 5:
-        out.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_3<out_data_t>{0.0, 1.0}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.5, 0.5}, num_thread);
         break;
     default:
-        out.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{1, 5}, num_thread);
 
         auto gen_wei = [](auto... is) {
-            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
+            return GeneratorTensor_2<in_data_t>{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
         };
         wei.GenerateTensorValue(gen_wei, num_thread);
     }
host/driver_offline/src/conv_fwd_driver_offline.cpp

@@ -76,6 +76,13 @@ void host_convolution_forward(const Tensor<TIn>& in,
                         int wi = wo * conv_strides[I1] + x * conv_dilations[I1] -
                                  in_left_pads[I1];
                         if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
                            wi < in.mDesc.GetLengths()[3])
                         {
+                            if constexpr(is_same<TIn, ushort>::value)
+                            {
+                                v += bfloat16_to_float(in(n, c, hi, wi)) *
+                                     bfloat16_to_float(wei(k, c, y, x));
+                            }
+                            else
+                            {
                                 v += static_cast<const double>(in(n, c, hi, wi)) *
                                      static_cast<const double>(wei(k, c, y, x));

@@ -83,7 +90,16 @@ void host_convolution_forward(const Tensor<TIn>& in,
+                            }
                         }
                     }
                 }
 
+        if constexpr(is_same<TOut, ushort>::value)
+        {
+            out(n, k, ho, wo) = float_to_bfloat16(v);
+        }
+        else
+        {
             out(n, k, ho, wo) = v;
+        }
     };
 
     auto f_nhwc = [&](auto n, auto ho, auto wo, auto k) {

@@ -98,6 +114,13 @@ void host_convolution_forward(const Tensor<TIn>& in,
                         int wi = wo * conv_strides[I1] + x * conv_dilations[I1] -
                                  in_left_pads[I1];
                         if(hi >= 0 && hi < in.mDesc.GetLengths()[1] && wi >= 0 &&
                            wi < in.mDesc.GetLengths()[2])
                         {
+                            if constexpr(is_same<TIn, ushort>::value)
+                            {
+                                v += bfloat16_to_float(in(n, hi, wi, c)) *
+                                     bfloat16_to_float(wei(k, y, x, c));
+                            }
+                            else
+                            {
                                 v += static_cast<const double>(in(n, hi, wi, c)) *
                                      static_cast<const double>(wei(k, y, x, c));

@@ -105,7 +128,15 @@ void host_convolution_forward(const Tensor<TIn>& in,
+                            }
                         }
                     }
                 }
 
+        if constexpr(is_same<TOut, ushort>::value)
+        {
+            out(n, ho, wo, k) = float_to_bfloat16(v);
+        }
+        else
+        {
             out(n, ho, wo, k) = v;
+        }
     };
 
     if(layout == ConvTensorLayout::NCHW)

@@ -223,10 +254,14 @@ int main(int argc, char* argv[])
 using in_data_t  = float;
 using acc_data_t = float;
 using out_data_t = float;
-#elif 1
+#elif 0
 using in_data_t  = half_t;
 using acc_data_t = float;
 using out_data_t = half_t;
+#elif 1
+using in_data_t  = ushort;
+using acc_data_t = float;
+using out_data_t = ushort;
 #elif 1
 using in_data_t  = int8_t;
 using acc_data_t = int32_t;

@@ -292,30 +327,30 @@ int main(int argc, char* argv[])
         // no initialization
         break;
     case 1:
-        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
         break;
     case 2:
-        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
         break;
     case 3:
-        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
         break;
     case 4:
-        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
         break;
     case 5:
-        in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
-        wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_3<in_data_t>{0.0, 1.0}, num_thread);
+        wei.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.5, 0.5}, num_thread);
         break;
     default:
-        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{1, 5}, num_thread);
 
         auto gen_wei = [](auto... is) {
-            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
+            return GeneratorTensor_2<in_data_t>{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
         };
         wei.GenerateTensorValue(gen_wei, num_thread);
     }
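The recurring change in these drivers is a compile-time branch on the element type: when the tensor carries bf16 payloads in a `ushort`, values are widened through `float` before accumulating; otherwise they are cast directly. A self-contained sketch of that dispatch pattern (`bf16_widen` is an illustrative stand-in for `bfloat16_to_float` from `bfloat16_dev.hpp`, not the repo's code):

```cpp
#include <cstdint>
#include <cstring>
#include <type_traits>

// A bf16 value is the high half of an IEEE-754 binary32,
// so shift it into the upper 16 bits and reinterpret.
static float bf16_widen(std::uint16_t v)
{
    std::uint32_t bits = static_cast<std::uint32_t>(v) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Compile-time dispatch in the style of the updated host reference paths:
// the bf16 branch is only instantiated when TIn really is the carrier type.
template <typename TIn>
double load_elem(TIn x)
{
    if constexpr(std::is_same<TIn, std::uint16_t>::value)
    {
        return bf16_widen(x); // bf16 payload: widen through float first
    }
    else
    {
        return static_cast<double>(x); // float / half / int8: direct cast
    }
}

int main()
{
    std::uint16_t one_bf16 = 0x3f80; // bf16 bit pattern of 1.0f
    return (load_elem(one_bf16) == 1.0 && load_elem(2.5f) == 2.5) ? 0 : 1;
}
```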
host/driver_offline/src/conv_wrw_driver_offline.cpp

@@ -297,30 +297,30 @@ int main(int argc, char* argv[])
         // no initialization
         break;
     case 1:
-        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        out.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
         break;
     case 2:
-        in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_1<in_data_t>{}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
         break;
     case 3:
-        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        out.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_1<out_data_t>{}, num_thread);
         break;
     case 4:
-        in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        out.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{-5, 5}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_2<out_data_t>{-5, 5}, num_thread);
         break;
     case 5:
-        in.GenerateTensorValue(GeneratorTensor_3<float>{-0.1, 0.1}, num_thread);
-        out.GenerateTensorValue(GeneratorTensor_3<float>{-0.1, 0.1}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_3<in_data_t>{-0.1, 0.1}, num_thread);
+        out.GenerateTensorValue(GeneratorTensor_3<out_data_t>{-0.1, 0.1}, num_thread);
         break;
     default:
-        in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
+        in.GenerateTensorValue(GeneratorTensor_2<in_data_t>{1, 5}, num_thread);
 
         auto gen_out = [](auto... is) {
-            return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
+            return GeneratorTensor_2<out_data_t>{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
         };
         out.GenerateTensorValue(gen_out, num_thread);
     }
host/driver_offline/src/gemm_driver_offline.cpp

@@ -10,7 +10,6 @@
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
-#include "gemm_common.hpp"
 #include "host_gemm.hpp"
 #include "device_tensor.hpp"
 #include "device_gemm_xdlops_mk_kn_mn.hpp"

@@ -31,6 +30,18 @@
 #define USE_GEMM_XDL_KM_KN_NM 0
 #define USE_GEMM_XDL_KM_NK_NM 0
 
+enum GemmMatrixLayout
+{
+    MK_KN_MN, // 0
+    MK_NK_MN, // 1
+    KM_KN_MN, // 2
+    KM_NK_MN, // 3
+    MK_KN_NM, // 4
+    MK_NK_NM, // 5
+    KM_KN_NM, // 6
+    KM_NK_NM  // 7
+};
+
 enum GemmAlgo
 {
     Xdl_MK_KN_MN, // 0

@@ -43,6 +54,161 @@ enum GemmAlgo
     Xdl_KM_NK_NM, // 7
 };
 
+template <typename AType, typename BType, typename CType>
+void host_gemm(const Tensor<AType>& a,
+               const Tensor<BType>& b,
+               Tensor<CType>& c,
+               const GemmMatrixLayout layout)
+{
+    if(layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        auto f_mk_kn_mn = [&](auto m, auto n) {
+            const int K = a.mDesc.GetLengths()[1];
+
+            double v = 0;
+
+            for(int k = 0; k < K; ++k)
+            {
+                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
+            }
+
+            c(m, n) = v;
+        };
+
+        make_ParallelTensorFunctor(f_mk_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
+            std::thread::hardware_concurrency());
+    }
+    else if(layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        auto f_mk_nk_mn = [&](auto m, auto n) {
+            const int K = a.mDesc.GetLengths()[1];
+
+            double v = 0;
+
+            for(int k = 0; k < K; ++k)
+            {
+                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
+            }
+
+            c(m, n) = v;
+        };
+
+        make_ParallelTensorFunctor(f_mk_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
+            std::thread::hardware_concurrency());
+    }
+    else if(layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        auto f_km_kn_mn = [&](auto m, auto n) {
+            const int K = a.mDesc.GetLengths()[0];
+
+            double v = 0;
+
+            for(int k = 0; k < K; ++k)
+            {
+                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
+            }
+
+            c(m, n) = v;
+        };
+
+        make_ParallelTensorFunctor(f_km_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
+            std::thread::hardware_concurrency());
+    }
+    else if(layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        auto f_km_nk_mn = [&](auto m, auto n) {
+            const int K = a.mDesc.GetLengths()[0];
+
+            double v = 0;
+
+            for(int k = 0; k < K; ++k)
+            {
+                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
+            }
+
+            c(m, n) = v;
+        };
+
+        make_ParallelTensorFunctor(f_km_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
+            std::thread::hardware_concurrency());
+    }
+    else if(layout == GemmMatrixLayout::MK_KN_NM)
+    {
+        auto f_mk_kn_nm = [&](auto n, auto m) {
+            const int K = a.mDesc.GetLengths()[1];
+
+            double v = 0;
+
+            for(int k = 0; k < K; ++k)
+            {
+                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
+            }
+
+            c(n, m) = v;
+        };
+
+        make_ParallelTensorFunctor(f_mk_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
+            std::thread::hardware_concurrency());
+    }
+    else if(layout == GemmMatrixLayout::MK_NK_NM)
+    {
+        auto f_mk_nk_nm = [&](auto n, auto m) {
+            const int K = a.mDesc.GetLengths()[1];
+
+            double v = 0;
+
+            for(int k = 0; k < K; ++k)
+            {
+                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
+            }
+
+            c(n, m) = v;
+        };
+
+        make_ParallelTensorFunctor(f_mk_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
+            std::thread::hardware_concurrency());
+    }
+    else if(layout == GemmMatrixLayout::KM_KN_NM)
+    {
+        auto f_km_kn_nm = [&](auto n, auto m) {
+            const int K = a.mDesc.GetLengths()[0];
+
+            double v = 0;
+
+            for(int k = 0; k < K; ++k)
+            {
+                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
+            }
+
+            c(n, m) = v;
+        };
+
+        make_ParallelTensorFunctor(f_km_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
+            std::thread::hardware_concurrency());
+    }
+    else if(layout == GemmMatrixLayout::KM_NK_NM)
+    {
+        auto f_km_nk_nm = [&](auto n, auto m) {
+            const int K = a.mDesc.GetLengths()[0];
+
+            double v = 0;
+
+            for(int k = 0; k < K; ++k)
+            {
+                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
+            }
+
+            c(n, m) = v;
+        };
+
+        make_ParallelTensorFunctor(f_km_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
+            std::thread::hardware_concurrency());
+    }
+    else
+    {
+        throw std::runtime_error("wrong! not supported layout");
+    }
+}
+
 int main(int argc, char* argv[])
 {
     using namespace ck;

@@ -73,10 +239,14 @@ int main(int argc, char* argv[])
 using ab_data_t  = float;
 using acc_data_t = float;
 using c_data_t   = float;
-#elif 1
+#elif 0
 using ab_data_t  = half_t;
 using acc_data_t = float;
 using c_data_t   = half_t;
+#elif 1
+using ab_data_t  = ushort;
+using acc_data_t = float;
+using c_data_t   = ushort;
 #elif 1
 using ab_data_t  = int8_t;
 using acc_data_t = int32_t;

@@ -155,24 +325,24 @@ int main(int argc, char* argv[])
         // no initialization
         break;
     case 1:
-        a.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        b.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        a.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
+        b.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
         break;
     case 2:
-        a.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        b.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        a.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
+        b.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
         break;
     case 3:
-        a.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        b.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        a.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
+        b.GenerateTensorValue(GeneratorTensor_1<ab_data_t>{}, num_thread);
         break;
     case 4:
-        a.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
-        b.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        a.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
+        b.GenerateTensorValue(GeneratorTensor_2<ab_data_t>{-5, 5}, num_thread);
         break;
     default:
-        a.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
-        b.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
+        a.GenerateTensorValue(GeneratorTensor_3<ab_data_t>{0.0, 1.0}, num_thread);
+        b.GenerateTensorValue(GeneratorTensor_3<ab_data_t>{-0.5, 0.5}, num_thread);
     }
 
 #if USE_GEMM_XDL_MK_KN_MN
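To make the layout naming concrete: in `MK_NK_MN`, A is stored M×K, B is stored N×K (i.e. Bᵀ), and C is stored M×N. Below is a self-contained version of what that branch of `host_gemm` computes, with plain loops and `std::vector` in place of `Tensor` and `make_ParallelTensorFunctor` (a sketch, not the driver code):

```cpp
#include <vector>
#include <cassert>

// Reference GEMM for the MK_NK_MN layout: C = A * B^T.
void ref_gemm_mk_nk_mn(const std::vector<float>& a, // M*K, row-major
                       const std::vector<float>& b, // N*K, row-major (B transposed)
                       std::vector<float>& c,       // M*N, row-major
                       int M, int N, int K)
{
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            double v = 0; // accumulate in double, as the driver does
            for(int k = 0; k < K; ++k)
                v += static_cast<double>(a[m * K + k]) * static_cast<double>(b[n * K + k]);
            c[m * N + n] = static_cast<float>(v);
        }
}

int main()
{
    // M = N = 1, K = 2: C[0][0] = 1*3 + 2*4 = 11
    std::vector<float> a{1, 2}, b{3, 4}, c(1);
    ref_gemm_mk_nk_mn(a, b, c, 1, 1, 2);
    assert(c[0] == 11.0f);
    return 0;
}
```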
host/host_tensor/include/host_gemm.hpp

 #pragma once
 
 #include "host_tensor.hpp"
-#include "gemm_common.hpp"
 
-template <typename AType, typename BType, typename CType>
-void host_gemm(const Tensor<AType>& a,
-               const Tensor<BType>& b,
-               Tensor<CType>& c,
-               const GemmMatrixLayout layout)
+template <>
+void host_gemm<ushort, ushort, ushort>(const Tensor<ushort>& a,
+                                       const Tensor<ushort>& b,
+                                       Tensor<ushort>& c,
+                                       const GemmMatrixLayout layout)
 {
     if(layout == GemmMatrixLayout::MK_KN_MN)

@@ -17,10 +16,10 @@ void host_gemm(const Tensor<AType>& a,
             for(int k = 0; k < K; ++k)
             {
-                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
+                v += bfloat16_to_float(a(m, k)) * bfloat16_to_float(b(k, n));
             }
 
-            c(m, n) = v;
+            c(m, n) = float_to_bfloat16(v);
         };
 
         make_ParallelTensorFunctor(f_mk_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(

@@ -35,10 +34,10 @@ void host_gemm(const Tensor<AType>& a,
             for(int k = 0; k < K; ++k)
             {
-                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
+                v += bfloat16_to_float(a(m, k)) * bfloat16_to_float(b(n, k));
             }
 
-            c(m, n) = v;
+            c(m, n) = float_to_bfloat16(v);
         };
 
         make_ParallelTensorFunctor(f_mk_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(

@@ -53,10 +52,10 @@ void host_gemm(const Tensor<AType>& a,
             for(int k = 0; k < K; ++k)
             {
-                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
+                v += bfloat16_to_float(a(k, m)) * bfloat16_to_float(b(k, n));
             }
 
-            c(m, n) = v;
+            c(m, n) = float_to_bfloat16(v);
         };
 
         make_ParallelTensorFunctor(f_km_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(

@@ -71,10 +70,10 @@ void host_gemm(const Tensor<AType>& a,
             for(int k = 0; k < K; ++k)
             {
-                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
+                v += bfloat16_to_float(a(k, m)) * bfloat16_to_float(b(n, k));
             }
 
-            c(m, n) = v;
+            c(m, n) = float_to_bfloat16(v);
         };
 
         make_ParallelTensorFunctor(f_km_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(

@@ -89,10 +88,10 @@ void host_gemm(const Tensor<AType>& a,
             for(int k = 0; k < K; ++k)
             {
-                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(k, n));
+                v += bfloat16_to_float(a(m, k)) * bfloat16_to_float(b(k, n));
             }
 
-            c(n, m) = v;
+            c(n, m) = float_to_bfloat16(v);
         };
 
         make_ParallelTensorFunctor(f_mk_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(

@@ -107,10 +106,10 @@ void host_gemm(const Tensor<AType>& a,
             for(int k = 0; k < K; ++k)
             {
-                v += static_cast<const double>(a(m, k)) * static_cast<const double>(b(n, k));
+                v += bfloat16_to_float(a(m, k)) * bfloat16_to_float(b(n, k));
             }
 
-            c(n, m) = v;
+            c(n, m) = float_to_bfloat16(v);
         };
 
         make_ParallelTensorFunctor(f_mk_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(

@@ -125,10 +124,10 @@ void host_gemm(const Tensor<AType>& a,
             for(int k = 0; k < K; ++k)
             {
-                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(k, n));
+                v += bfloat16_to_float(a(k, m)) * bfloat16_to_float(b(k, n));
             }
 
-            c(n, m) = v;
+            c(n, m) = float_to_bfloat16(v);
         };
 
         make_ParallelTensorFunctor(f_km_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(

@@ -143,10 +142,10 @@ void host_gemm(const Tensor<AType>& a,
             for(int k = 0; k < K; ++k)
             {
-                v += static_cast<const double>(a(k, m)) * static_cast<const double>(b(n, k));
+                v += bfloat16_to_float(a(k, m)) * bfloat16_to_float(b(n, k));
             }
 
-            c(n, m) = v;
+            c(n, m) = float_to_bfloat16(v);
         };
 
         make_ParallelTensorFunctor(f_km_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
host/host_tensor/include/host_tensor.hpp

@@ -333,4 +333,41 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
     std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value
               << std::endl;
 }
 
+float bf16_to_f32(ushort src_val)
+{
+    typedef union
+    {
+        ushort x, y;
+        float f32;
+    } bf16_f32_t;
+
+    bf16_f32_t v;
+    v.x = 0;
+    v.y = src_val;
+    return v.f32;
+}
+
+template <>
+void check_error<ushort>(const Tensor<ushort>& ref, const Tensor<ushort>& result)
+{
+    float error      = 0;
+    float max_diff   = -1;
+    float ref_value  = 0, result_value = 0;
+    for(int i = 0; i < ref.mData.size(); ++i)
+    {
+        error += std::abs(bf16_to_f32(ref.mData[i]) - bf16_to_f32(result.mData[i]));
+        float diff = std::abs(bf16_to_f32(ref.mData[i]) - bf16_to_f32(result.mData[i]));
+        if(max_diff < diff)
+        {
+            max_diff     = diff;
+            ref_value    = bf16_to_f32(ref.mData[i]);
+            result_value = bf16_to_f32(result.mData[i]);
+        }
+    }
+    std::cout << "error: " << error << std::endl;
+    std::cout << "max_diff: " << max_diff << ", ref: " << ref_value << ", res: " << result_value
+              << std::endl;
+}
+
 #endif
host/host_tensor/include/host_tensor_generator.hpp

@@ -4,6 +4,7 @@
 #include <cmath>
 #include "config.hpp"
 
+template <typename T>
 struct GeneratorTensor_1
 {
     int value = 1;
@@ -15,6 +16,30 @@ struct GeneratorTensor_1
     }
 };
 
+template <>
+struct GeneratorTensor_1<ushort>
+{
+    float value = 1.0;
+
+    template <typename... Is>
+    ushort operator()(Is...)
+    {
+        return float_to_bfloat16(value);
+    }
+};
+
+template <>
+struct GeneratorTensor_1<int8_t>
+{
+    int8_t value = 1;
+
+    template <typename... Is>
+    int8_t operator()(Is...)
+    {
+        return value;
+    }
+};
+
 struct GeneratorTensor_0
 {
     int value = 0;
@@ -26,6 +51,7 @@ struct GeneratorTensor_0
     }
 };
 
+template <typename T>
 struct GeneratorTensor_2
 {
     int min_value = 0;
@@ -38,6 +64,33 @@ struct GeneratorTensor_2
     }
 };
 
+template <>
+struct GeneratorTensor_2<ushort>
+{
+    int min_value = 0;
+    int max_value = 1;
+
+    template <typename... Is>
+    ushort operator()(Is...)
+    {
+        float tmp = (std::rand() % (max_value - min_value)) + min_value;
+
+        return float_to_bfloat16(tmp);
+    }
+};
+
+template <>
+struct GeneratorTensor_2<int8_t>
+{
+    int min_value = 0;
+    int max_value = 1;
+
+    template <typename... Is>
+    int8_t operator()(Is...)
+    {
+        return (std::rand() % (max_value - min_value)) + min_value;
+    }
+};
+
 template <typename T>
 struct GeneratorTensor_3
 {
@@ -53,6 +106,39 @@ struct GeneratorTensor_3
     }
 };
 
+template <>
+struct GeneratorTensor_3<ushort>
+{
+    float min_value = 0;
+    float max_value = 1;
+
+    template <typename... Is>
+    ushort operator()(Is...)
+    {
+        float tmp = float(std::rand()) / float(RAND_MAX);
+
+        float fp32_tmp = min_value + tmp * (max_value - min_value);
+
+        return float_to_bfloat16(fp32_tmp);
+    }
+};
+
+template <>
+struct GeneratorTensor_3<int8_t>
+{
+    float min_value = 0;
+    float max_value = 1;
+
+    template <typename... Is>
+    int8_t operator()(Is...)
+    {
+        int8_t min_tmp = static_cast<int8_t>(min_value);
+        int8_t max_tmp = static_cast<int8_t>(max_value);
+
+        return (std::rand() % (max_tmp - min_tmp)) + min_tmp;
+    }
+};
+
 struct GeneratorTensor_Checkboard
 {
     template <typename... Ts>
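For context on how these functors are consumed: `GenerateTensorValue` walks every element of the tensor and invokes the generator with that element's indices, so a generator either ignores the indices (as `GeneratorTensor_1`/`_2` do) or uses them (as `GeneratorTensor_Checkboard` does). A simplified sketch of that contract, where `generate_2d` is a hypothetical stand-in for the tensor's element walker:

```cpp
#include <cstdlib>
#include <vector>
#include <cstdio>

// Mirrors the shape of GeneratorTensor_2: uniform int in [min_value, max_value).
struct GeneratorTensor_2_sketch
{
    int min_value = 0;
    int max_value = 1;

    template <typename... Is>
    int operator()(Is...) // indices ignored: the value is index-independent
    {
        return (std::rand() % (max_value - min_value)) + min_value;
    }
};

// Hypothetical stand-in for GenerateTensorValue: call g once per element,
// passing the element's indices.
template <typename G>
void generate_2d(std::vector<int>& data, int rows, int cols, G g)
{
    for(int r = 0; r < rows; ++r)
        for(int c = 0; c < cols; ++c)
            data[r * cols + c] = g(r, c);
}

int main()
{
    std::vector<int> t(4 * 4);
    generate_2d(t, 4, 4, GeneratorTensor_2_sketch{-5, 5}); // values in [-5, 4]
    std::printf("t[0] = %d\n", t[0]);
    return 0;
}
```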
profiler/gemm_profiler.cpp

@@ -9,13 +9,30 @@
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
-#include "gemm_common.hpp"
 #include "host_gemm.hpp"
 #include "device_tensor.hpp"
 #include "device_base.hpp"
 #include "device_gemm_xdl.hpp"
 #include "profile_gemm.hpp"
 
+enum GemmMatrixLayout
+{
+    MK_KN_MN, // 0
+    MK_NK_MN, // 1
+    KM_KN_MN, // 2
+    KM_NK_MN, // 3
+    MK_KN_NM, // 4
+    MK_NK_NM, // 5
+    KM_KN_NM, // 6
+    KM_NK_NM, // 7
+};
+
+enum GemmDataType
+{
+    F32_F32_F32, // 0
+    F16_F16_F16, // 1
+};
+
 int gemm_profiler(int argc, char* argv[])
 {
     if(argc != 14)
script/profile_conv.sh (new file, 0 → 100755)

#!/bin/bash

## GPU visibility
export HIP_VISIBLE_DEVICES=0

make -j ckProfiler

DRIVER="./profiler/ckProfiler"

OP=$1
DATATYPE=$2
IN_LAYOUT=$3
WEI_LAYOUT=$4
OUT_LAYOUT=$5
VERIFY=$6
INIT=$7
LOG=$8
REPEAT=$9

# test
######## op  datatype  in_layout   wei_layout   out_layout   verify  init  log  repeat  N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
 $DRIVER  $OP $DATATYPE $IN_LAYOUT  $WEI_LAYOUT  $OUT_LAYOUT  $VERIFY $INIT $LOG $REPEAT 128 256  192  3 3 71   71   2 2     1 1       1 1      1 1       $DESIRED_GRID_SIZE
#N=${10}
# Resnet50
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 1024 1 1 14 14 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 58 58 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 30 30 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 256 1 1 56 56 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 16 16 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 512 1 1 28 28 2 2 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 $DESIRED_GRID_SIZE
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 $DESIRED_GRID_SIZE
# SSD
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads Desired_grid_size__
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 3 7 7 300 300 2 2 1 1 3 3 3 3
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 64 64 3 3 75 75 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 64 1 1 75 75 2 2 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 64 3 3 75 75 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 1 1 38 38 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 256 1 1 38 38 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 512 256 3 3 38 38 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 512 1 1 19 19 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 512 256 3 3 19 19 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 512 1 1 10 10 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 10 10 2 2 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 256 1 1 5 5 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 5 5 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 128 256 1 1 3 3 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 256 128 3 3 3 3 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 340 256 3 3 38 38 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 512 3 3 19 19 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 512 3 3 10 10 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 510 256 3 3 5 5 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT 120 340 256 3 3 3 3 1 1 1 1 1 1 1 1
script/profile_gemm.sh (new file, 0 → 100755)

#!/bin/bash

## GPU visibility
export HIP_VISIBLE_DEVICES=0

make -j ckProfiler

DRIVER="./profiler/ckProfiler"

OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
REPEAT=$7

######## op  datatype  layout  verify  init  log  repeat  M___ N___ K___ StrideA StrideB StrideC
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT  256  256  256     256     256     256
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT  960 1024 1024    1024    1024    1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024    1024    1024    1024
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048    2048    2048    2048
 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096    4096    4096    4096
#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192    8192    8192    8192