Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
3289a5c9
Commit
3289a5c9
authored
Nov 21, 2024
by
Andriy Roshchenko
Browse files
Narrowing the scope of PR to OCP FP8 enablement only
parent
dbfb222d
Changes
63
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
91 additions
and
400 deletions
+91
-400
example/01_gemm/gemm_xdl_int8.cpp
example/01_gemm/gemm_xdl_int8.cpp
+1
-1
example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp
example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp
+1
-1
example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp
example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp
+1
-1
example/01_gemm/gemm_xdl_streamk.cpp
example/01_gemm/gemm_xdl_streamk.cpp
+1
-1
example/01_gemm/gemm_xdl_wavelet_fp16.cpp
example/01_gemm/gemm_xdl_wavelet_fp16.cpp
+1
-1
example/01_gemm/run_gemm_example.inc
example/01_gemm/run_gemm_example.inc
+23
-59
example/01_gemm/run_gemm_example_streamk_v2.inc
example/01_gemm/run_gemm_example_streamk_v2.inc
+11
-18
example/01_gemm/run_gemm_example_v2.inc
example/01_gemm/run_gemm_example_v2.inc
+19
-18
example/04_gemm_add_add_fastgelu/common.hpp
example/04_gemm_add_add_fastgelu/common.hpp
+2
-2
example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
..._gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
+2
-2
example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
...mm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
+1
-90
example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
+2
-2
example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
...le/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
+4
-8
example/15_grouped_gemm/run_grouped_gemm_example.inc
example/15_grouped_gemm/run_grouped_gemm_example.inc
+4
-7
example/20_grouped_conv_bwd_weight/common.hpp
example/20_grouped_conv_bwd_weight/common.hpp
+2
-2
example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
...d_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+4
-9
example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
...layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
+2
-2
example/35_splitK_gemm/run_splitK_gemm_example.inc
example/35_splitK_gemm/run_splitK_gemm_example.inc
+8
-172
example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp
example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp
+1
-2
example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
+1
-2
No files found.
example/01_gemm/gemm_xdl_int8.cpp
View file @
3289a5c9
...
@@ -46,4 +46,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
...
@@ -46,4 +46,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
#include "run_gemm_example.inc"
#include "run_gemm_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
(
run_gemm_example
(
argc
,
argv
)
?
0
:
-
1
)
;
}
int
main
(
int
argc
,
char
*
argv
[])
{
return
!
run_gemm_example
(
argc
,
argv
);
}
example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp
View file @
3289a5c9
...
@@ -66,4 +66,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
...
@@ -66,4 +66,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
#include "run_gemm_example.inc"
#include "run_gemm_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
(
run_gemm_example
(
argc
,
argv
)
?
0
:
-
1
)
;
}
int
main
(
int
argc
,
char
*
argv
[])
{
return
!
run_gemm_example
(
argc
,
argv
);
}
example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp
View file @
3289a5c9
...
@@ -65,4 +65,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
...
@@ -65,4 +65,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
#include "run_gemm_example.inc"
#include "run_gemm_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
(
run_gemm_example
(
argc
,
argv
)
?
0
:
-
1
)
;
}
int
main
(
int
argc
,
char
*
argv
[])
{
return
!
run_gemm_example
(
argc
,
argv
);
}
example/01_gemm/gemm_xdl_streamk.cpp
View file @
3289a5c9
...
@@ -57,4 +57,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
...
@@ -57,4 +57,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
#include "run_gemm_example.inc"
#include "run_gemm_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
(
run_gemm_streamk_example
(
argc
,
argv
)
?
0
:
-
1
)
;
}
int
main
(
int
argc
,
char
*
argv
[])
{
return
!
run_gemm_streamk_example
(
argc
,
argv
);
}
example/01_gemm/gemm_xdl_wavelet_fp16.cpp
View file @
3289a5c9
...
@@ -50,4 +50,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
...
@@ -50,4 +50,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
#include "run_gemm_example.inc"
#include "run_gemm_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
(
run_gemm_example
(
argc
,
argv
)
?
0
:
-
1
)
;
}
int
main
(
int
argc
,
char
*
argv
[])
{
return
!
run_gemm_example
(
argc
,
argv
);
}
example/01_gemm/run_gemm_example.inc
View file @
3289a5c9
...
@@ -166,14 +166,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -166,14 +166,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
2.
f
,
2.
f
}(
a_m_k
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ADataType
>
{
-
2.
f
,
2.
f
}(
a_m_k
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
-
2.
f
,
2.
f
}(
b_k_n
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
BDataType
>
{
-
2.
f
,
2.
f
}(
b_k_n
);
break
;
break
;
case
6
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_PI
<
ADataType
>
{});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_1
<
BDataType
>
{
1
});
break
;
case
7
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_PI_A
<
ADataType
>
{});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_PI_B
<
BDataType
>
{});
break
;
default
:
default
:
ck
::
utils
::
FillUniformDistribution
<
ADataType
>
{
-
0.1
f
,
0.1
f
}(
a_m_k
);
ck
::
utils
::
FillUniformDistribution
<
ADataType
>
{
-
0.1
f
,
0.1
f
}(
a_m_k
);
ck
::
utils
::
FillUniformDistribution
<
BDataType
>
{
-
0.1
f
,
0.1
f
}(
b_k_n
);
ck
::
utils
::
FillUniformDistribution
<
BDataType
>
{
-
0.1
f
,
0.1
f
}(
b_k_n
);
...
@@ -256,7 +248,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -256,7 +248,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
{
{
std
::
cerr
<<
gemm
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
std
::
cerr
<<
gemm
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
return
fals
e
;
return
tru
e
;
}
}
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
...
@@ -289,7 +281,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -289,7 +281,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
{
{
std
::
cerr
<<
gemm
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
std
::
cerr
<<
gemm
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
return
fals
e
;
return
tru
e
;
}
}
std
::
size_t
workspace_size
=
gemm
.
GetWorkSpaceSize
(
&
argument
);
std
::
size_t
workspace_size
=
gemm
.
GetWorkSpaceSize
(
&
argument
);
...
@@ -322,26 +314,19 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -322,26 +314,19 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
std
::
cerr
<<
gemm
.
GetTypeString
()
<<
": the instance does not support the problem config."
std
::
cerr
<<
gemm
.
GetTypeString
()
<<
": the instance does not support the problem config."
<<
std
::
endl
;
<<
std
::
endl
;
return
fals
e
;
return
tru
e
;
}
}
if
(
config
.
time_kernel
)
std
::
size_t
flop
=
2_
uz
*
M
*
N
*
K
;
{
std
::
size_t
num_btype
=
std
::
size_t
flop
=
2_
uz
*
M
*
N
*
K
;
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
N
+
sizeof
(
CDataType
)
*
M
*
N
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
N
+
sizeof
(
CDataType
)
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
" GB/s, "
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
}
else
{
std
::
cout
<<
"FINISHED: "
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
}
bool
pass
=
true
;
bool
pass
=
true
;
...
@@ -368,29 +353,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -368,29 +353,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
#else
#else
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
pass
=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
pass
&
=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
,
c_m_n_host_result
,
"Error: Incorrect results!"
,
"Error: Incorrect results!"
,
get_rtol
<
CDataType
>
(),
get_rtol
<
CDataType
>
(),
get_atol
<
CDataType
>
());
get_atol
<
CDataType
>
());
#endif
#endif
if
(
pass
)
std
::
cout
<<
"Verification on CPU: PASS"
<<
std
::
endl
;
if
(
config
.
init_method
==
6
||
config
.
init_method
==
7
)
{
std
::
cout
<<
std
::
fixed
<<
std
::
setprecision
(
16
);
AccDataType
d
=
ck
::
type_convert
<
AccDataType
>
(
c_m_n_device_result
(
0
,
10
));
AccDataType
h
=
ck
::
type_convert
<
AccDataType
>
(
c_m_n_host_result
(
10
,
0
));
std
::
cout
<<
"device result: "
<<
d
<<
std
::
endl
;
std
::
cout
<<
"host result: "
<<
h
<<
std
::
endl
;
std
::
cout
<<
"expected result: "
<<
M_PI
<<
std
::
endl
;
std
::
cout
<<
"device - host: "
<<
std
::
abs
(
d
-
h
)
<<
std
::
endl
;
std
::
cout
<<
"device - expected: "
<<
std
::
abs
(
d
-
M_PI
)
<<
std
::
endl
;
std
::
cout
<<
"atol: "
<<
get_atol
<
CDataType
>
()
<<
std
::
endl
;
std
::
cout
<<
std
::
endl
<<
std
::
endl
;
}
}
}
if
((
config
.
do_verification
==
2
)
||
(
config
.
do_verification
==
3
))
if
((
config
.
do_verification
==
2
)
||
(
config
.
do_verification
==
3
))
...
@@ -416,18 +384,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -416,18 +384,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
c_m_n_device_ref_buf
.
FromDevice
(
c_m_n_device_ref_result
.
mData
.
data
());
c_m_n_device_ref_buf
.
FromDevice
(
c_m_n_device_ref_result
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
auto
gpu_pass
=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
pass
&=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_device_ref_result
,
c_m_n_device_ref_result
,
"Error: Incorrect results!"
,
"Error: Incorrect results!"
,
get_rtol
<
CDataType
>
(),
get_rtol
<
CDataType
>
(),
get_atol
<
CDataType
>
());
get_atol
<
CDataType
>
());
if
(
gpu_pass
)
std
::
cout
<<
"Verification on GPU: PASS"
<<
std
::
endl
;
pass
=
pass
&&
gpu_pass
;
}
}
return
pass
;
return
pass
==
true
;
}
}
bool
run_gemm_example
(
int
argc
,
char
*
argv
[])
bool
run_gemm_example
(
int
argc
,
char
*
argv
[])
...
@@ -435,7 +399,7 @@ bool run_gemm_example(int argc, char* argv[])
...
@@ -435,7 +399,7 @@ bool run_gemm_example(int argc, char* argv[])
ProblemSize
problem_size
;
ProblemSize
problem_size
;
ExecutionConfig
config
;
ExecutionConfig
config
;
return
parse_cmd_args
(
argc
,
argv
,
problem_size
,
config
)
&&
run_gemm
(
problem_size
,
config
);
return
!
parse_cmd_args
(
argc
,
argv
,
problem_size
,
config
)
||
run_gemm
(
problem_size
,
config
);
}
}
bool
run_gemm_streamk_example
(
int
argc
,
char
*
argv
[])
bool
run_gemm_streamk_example
(
int
argc
,
char
*
argv
[])
...
@@ -443,5 +407,5 @@ bool run_gemm_streamk_example(int argc, char* argv[])
...
@@ -443,5 +407,5 @@ bool run_gemm_streamk_example(int argc, char* argv[])
ProblemSizeStreamK
problem_size
;
ProblemSizeStreamK
problem_size
;
ExecutionConfig
config
;
ExecutionConfig
config
;
return
parse_cmd_args
(
argc
,
argv
,
problem_size
,
config
)
&&
run_gemm
(
problem_size
,
config
);
return
!
parse_cmd_args
(
argc
,
argv
,
problem_size
,
config
)
||
run_gemm
(
problem_size
,
config
);
}
}
example/01_gemm/run_gemm_example_streamk_v2.inc
View file @
3289a5c9
...
@@ -162,12 +162,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -162,12 +162,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
2
,
2
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
2
,
2
});
break
;
break
;
case
2
:
case
2
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_
3
<
ADataType
>
{
0.0
,
1.0
});
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_
1
<
ADataType
>
{
1
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_
3
<
BDataType
>
{
-
2
,
2
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_
2
<
BDataType
>
{
-
2
,
2
});
break
;
break
;
case
3
:
case
3
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_
3
<
ADataType
>
{
-
2
,
2
});
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_
2
<
ADataType
>
{
-
2
,
2
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_
3
<
A
DataType
>
{
0.0
,
1.0
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_
1
<
B
DataType
>
{
1
});
break
;
break
;
default
:
default
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
...
@@ -237,13 +237,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -237,13 +237,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
{
{
std
::
cerr
<<
gemm
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
std
::
cerr
<<
gemm
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
return
fals
e
;
return
tru
e
;
}
}
bool
pass
=
true
;
bool
pass
=
true
;
if
((
config
.
do_verification
==
1
)
||
(
config
.
do_verification
==
3
))
if
((
config
.
do_verification
==
1
)
||
(
config
.
do_verification
==
3
))
{
{
std
::
cout
<<
"Compute reference GEMM on CPU... "
;
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
...
@@ -251,11 +250,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -251,11 +250,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
a_m_k
,
b_k_n
,
c_m_n_host_result
,
PassThrough
{},
PassThrough
{},
PassThrough
{});
a_m_k
,
b_k_n
,
c_m_n_host_result
,
PassThrough
{},
PassThrough
{},
PassThrough
{});
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
std
::
cout
<<
"DONE!"
<<
std
::
endl
;
std
::
cout
<<
"Compute GEMM on device...
\n
"
;
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
false
,
1
});
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
false
,
1
});
std
::
cout
<<
"DONE!"
<<
std
::
endl
;
#ifdef BUILD_INT4_EXAMPLE
#ifdef BUILD_INT4_EXAMPLE
Tensor
<
CDataType
>
c_m_n_device_result_converted
(
c_m_n_host_result
.
mDesc
);
Tensor
<
CDataType
>
c_m_n_device_result_converted
(
c_m_n_host_result
.
mDesc
);
...
@@ -267,19 +263,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -267,19 +263,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
#else
#else
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
pass
=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
pass
&=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
,
c_m_n_host_result
,
"Error: Incorrect results!"
,
"Error: Incorrect results!"
,
get_rtol
<
CDataType
>
(),
get_rtol
<
CDataType
>
(),
get_atol
<
CDataType
>
());
get_atol
<
CDataType
>
());
if
(
pass
)
std
::
cout
<<
"Verification on CPU: PASS"
<<
std
::
endl
;
#endif
#endif
}
}
if
(
config
.
time_kernel
)
if
(
config
.
time_kernel
)
{
{
std
::
cout
<<
"Time GEMM on device...
\n
"
;
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
std
::
size_t
flop
=
2_
uz
*
M
*
N
*
K
;
std
::
size_t
flop
=
2_
uz
*
M
*
N
*
K
;
...
@@ -301,5 +294,5 @@ bool run_gemm_universal_streamk_example(int argc, char* argv[])
...
@@ -301,5 +294,5 @@ bool run_gemm_universal_streamk_example(int argc, char* argv[])
ProblemSizeStreamK_universal
problem_size
;
ProblemSizeStreamK_universal
problem_size
;
ExecutionConfig
config
;
ExecutionConfig
config
;
return
parse_cmd_args
(
argc
,
argv
,
problem_size
,
config
)
&&
run_gemm
(
problem_size
,
config
);
return
!
parse_cmd_args
(
argc
,
argv
,
problem_size
,
config
)
||
run_gemm
(
problem_size
,
config
);
}
}
example/01_gemm/run_gemm_example_v2.inc
View file @
3289a5c9
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
@@ -146,11 +146,19 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -146,11 +146,19 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_1
<
BDataType
>
{
1
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_1
<
BDataType
>
{
1
});
break
;
break
;
case
1
:
case
1
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
-
2
,
2
});
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
2
,
2
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
3
,
3
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
2
,
2
});
break
;
case
2
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_1
<
ADataType
>
{
1
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
2
,
2
});
break
;
case
3
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
2
,
2
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_1
<
BDataType
>
{
1
});
break
;
break
;
default
:
default
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
-
1
.0
,
1.0
});
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0
.0
,
1.0
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
}
}
...
@@ -216,13 +224,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -216,13 +224,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
{
{
std
::
cerr
<<
gemm
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
std
::
cerr
<<
gemm
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
return
fals
e
;
return
tru
e
;
}
}
bool
pass
=
true
;
bool
pass
=
true
;
if
((
config
.
do_verification
==
1
)
||
(
config
.
do_verification
==
3
))
if
((
config
.
do_verification
==
1
)
||
(
config
.
do_verification
==
3
))
{
{
std
::
cout
<<
"Compute reference GEMM on CPU... "
;
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
...
@@ -230,11 +237,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -230,11 +237,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
a_m_k
,
b_k_n
,
c_m_n_host_result
,
PassThrough
{},
PassThrough
{},
PassThrough
{});
a_m_k
,
b_k_n
,
c_m_n_host_result
,
PassThrough
{},
PassThrough
{},
PassThrough
{});
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
std
::
cout
<<
"DONE!"
<<
std
::
endl
;
std
::
cout
<<
"Compute GEMM on device...
\n
"
;
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
false
,
1
});
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
false
,
1
});
std
::
cout
<<
"DONE!"
<<
std
::
endl
;
#ifdef BUILD_INT4_EXAMPLE
#ifdef BUILD_INT4_EXAMPLE
Tensor
<
CDataType
>
c_m_n_device_result_converted
(
c_m_n_host_result
.
mDesc
);
Tensor
<
CDataType
>
c_m_n_device_result_converted
(
c_m_n_host_result
.
mDesc
);
...
@@ -246,19 +250,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
...
@@ -246,19 +250,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
#else
#else
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
pass
=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
pass
&=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
,
c_m_n_host_result
,
"Error: Incorrect results!"
,
"Error: Incorrect results!"
,
get_rtol
<
CDataType
>
(),
get_rtol
<
CDataType
>
(),
get_atol
<
CDataType
>
());
get_atol
<
CDataType
>
());
if
(
pass
)
std
::
cout
<<
"Verification on CPU: PASS"
<<
std
::
endl
;
#endif
#endif
}
}
if
(
config
.
time_kernel
)
if
(
config
.
time_kernel
)
{
{
std
::
cout
<<
"Time GEMM on device...
\n
"
;
ave_time
=
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
,
0
,
5
,
10
,
true
,
4
});
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
,
0
,
5
,
10
,
true
,
4
});
...
@@ -281,5 +282,5 @@ bool run_gemm_splitk_example(int argc, char* argv[])
...
@@ -281,5 +282,5 @@ bool run_gemm_splitk_example(int argc, char* argv[])
ProblemSizeSplitK
problem_size
;
ProblemSizeSplitK
problem_size
;
ExecutionConfig
config
;
ExecutionConfig
config
;
return
parse_cmd_args
(
argc
,
argv
,
problem_size
,
config
)
&&
run_gemm
(
problem_size
,
config
);
return
!
parse_cmd_args
(
argc
,
argv
,
problem_size
,
config
)
||
run_gemm
(
problem_size
,
config
);
}
}
example/04_gemm_add_add_fastgelu/common.hpp
View file @
3289a5c9
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
@@ -57,7 +57,7 @@ struct ProblemSize final
...
@@ -57,7 +57,7 @@ struct ProblemSize final
struct
ExecutionConfig
final
struct
ExecutionConfig
final
{
{
bool
do_verification
=
true
;
bool
do_verification
=
true
;
int
init_method
=
2
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
bool
time_kernel
=
false
;
};
};
...
...
example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp
View file @
3289a5c9
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "common.hpp"
...
@@ -7,7 +7,7 @@ using ADataType = BF16;
...
@@ -7,7 +7,7 @@ using ADataType = BF16;
using
BDataType
=
BF16
;
using
BDataType
=
BF16
;
using
AccDataType
=
F32
;
using
AccDataType
=
F32
;
using
CShuffleDataType
=
F32
;
using
CShuffleDataType
=
F32
;
using
CDataType
=
F32
;
// C matrix doesn't ex
i
st in GPU memory, this is used for host verification
using
CDataType
=
F32
;
// C matrix doesn't exs
i
t in GPU memory, this is used for host verification
using
D0DataType
=
BF16
;
using
D0DataType
=
BF16
;
using
D1DataType
=
BF16
;
using
D1DataType
=
BF16
;
using
DsDataType
=
ck
::
Tuple
<
D0DataType
,
D1DataType
>
;
using
DsDataType
=
ck
::
Tuple
<
D0DataType
,
D1DataType
>
;
...
...
example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc
View file @
3289a5c9
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
template
<
typename
DataType
>
inline
__host__
__device__
constexpr
double
get_rtol
()
{
if
constexpr
(
std
::
is_same_v
<
DataType
,
float
>
)
{
return
1
e
-
3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
double
>
)
{
return
1
e
-
6
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
half_t
>
)
{
return
1
e
-
3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bhalf_t
>
)
{
return
5
e
-
2
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int32_t
>
)
{
return
1
e
-
1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int8_t
>
)
{
return
1
e
-
1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
return
2
e
-
1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
return
2
e
-
1
;
}
else
{
return
1
e
-
3
;
}
}
template
<
typename
DataType
>
inline
__host__
__device__
constexpr
double
get_atol
()
{
if
constexpr
(
std
::
is_same_v
<
DataType
,
float
>
)
{
return
1
e
-
3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
double
>
)
{
return
1
e
-
6
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
half_t
>
)
{
return
1
e
-
3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bhalf_t
>
)
{
return
5
e
-
2
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int32_t
>
)
{
return
1
e
-
1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int8_t
>
)
{
return
1
e
-
1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
return
2
e
-
1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
return
2
e
-
1
;
}
else
{
return
1
e
-
3
;
}
}
bool
run_gemm_add_add_fastgelu
(
const
ProblemSize
&
problem_size
,
const
ExecutionConfig
&
config
)
bool
run_gemm_add_add_fastgelu
(
const
ProblemSize
&
problem_size
,
const
ExecutionConfig
&
config
)
{
{
#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
...
@@ -235,11 +150,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
...
@@ -235,11 +150,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
return
ck
::
utils
::
check_err
(
e_m_n_device_result_converted
,
e_m_n_host_result
);
return
ck
::
utils
::
check_err
(
e_m_n_device_result_converted
,
e_m_n_host_result
);
#else
#else
return
ck
::
utils
::
check_err
(
e_m_n_device_result
,
return
ck
::
utils
::
check_err
(
e_m_n_device_result
,
e_m_n_host_result
);
e_m_n_host_result
,
"Error: Incorrect results!"
,
get_rtol
<
EDataType
>
(),
get_atol
<
EDataType
>
());
#endif
#endif
}
}
...
...
example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16.cpp
View file @
3289a5c9
...
@@ -157,8 +157,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
...
@@ -157,8 +157,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
b_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
b_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
break
;
break
;
default:
default:
a_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_
1
<
ADataType
>
{
1.0
});
a_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_
Sequential
<
ADataType
,
0
>
{
});
b_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_
1
<
BDataType
>
{
1.0
});
b_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_
Sequential
<
BDataType
,
1
>
{
});
}
}
}
}
...
...
example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
View file @
3289a5c9
...
@@ -75,7 +75,7 @@ struct ProblemSize final
...
@@ -75,7 +75,7 @@ struct ProblemSize final
struct
ExecutionConfig
final
struct
ExecutionConfig
final
{
{
bool
do_verification
=
true
;
bool
do_verification
=
true
;
int
init_method
=
2
;
int
init_method
=
1
;
int
k_batch
=
1
;
int
k_batch
=
1
;
bool
time_kernel
=
false
;
bool
time_kernel
=
false
;
};
};
...
@@ -154,12 +154,12 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
...
@@ -154,12 +154,12 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
b_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
5
,
5
});
b_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
5
,
5
});
break
;
break
;
case
2
:
case
2
:
a_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
-
1
.0
,
1.0
});
a_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0
.0
,
1.0
});
b_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
b_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
break
;
break
;
default:
default:
a_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_
1
<
ADataType
>
{
1.0
});
a_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_
Sequential
<
ADataType
,
0
>
{
});
b_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_
1
<
BDataType
>
{
1.0
});
b_tensors
[
i
].
GenerateTensorValue
(
GeneratorTensor_
Sequential
<
BDataType
,
1
>
{
});
}
}
}
}
...
@@ -266,7 +266,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
...
@@ -266,7 +266,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
BElementOp
,
BElementOp
,
CDEElementOp
>
;
CDEElementOp
>
;
std
::
cout
<<
"Running verification on CPU."
<<
std
::
endl
;
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
for
(
std
::
size_t
i
=
0
;
i
<
gemm_descs
.
size
();
i
++
)
{
{
c_tensors_device
[
i
]
->
FromDevice
(
c_device_tensors
[
i
].
mData
.
data
(),
c_tensors_device
[
i
]
->
FromDevice
(
c_device_tensors
[
i
].
mData
.
data
(),
...
@@ -286,9 +285,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
...
@@ -286,9 +285,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
pass
&=
ck
::
utils
::
check_err
(
c_device_tensors
[
i
],
c_host_tensors
[
i
]);
pass
&=
ck
::
utils
::
check_err
(
c_device_tensors
[
i
],
c_host_tensors
[
i
]);
}
}
if
(
pass
)
std
::
cout
<<
"Verification on CPU: PASS"
<<
std
::
endl
;
}
}
return
pass
;
return
pass
;
...
...
example/15_grouped_gemm/run_grouped_gemm_example.inc
View file @
3289a5c9
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c)
2018-
2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
@@ -123,12 +123,12 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
...
@@ -123,12 +123,12 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
b_tensors
[
i
]
.
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
5
,
5
});
b_tensors
[
i
]
.
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
5
,
5
});
break
;
break
;
case
2
:
case
2
:
a_tensors
[
i
]
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
-
1
.0
,
1.0
});
a_tensors
[
i
]
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0
.0
,
1.0
});
b_tensors
[
i
]
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
b_tensors
[
i
]
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
break
;
break
;
default
:
default
:
a_tensors
[
i
]
.
GenerateTensorValue
(
GeneratorTensor_
1
<
ADataType
>
{
1
});
a_tensors
[
i
]
.
GenerateTensorValue
(
GeneratorTensor_
Sequential
<
ADataType
,
0
>
{});
b_tensors
[
i
]
.
GenerateTensorValue
(
GeneratorTensor_
1
<
BDataType
>
{
1
});
b_tensors
[
i
]
.
GenerateTensorValue
(
GeneratorTensor_
Sequential
<
BDataType
,
1
>
{});
}
}
}
}
...
@@ -187,7 +187,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
...
@@ -187,7 +187,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
bool
pass
=
true
;
bool
pass
=
true
;
if
(
config
.
do_verification
)
if
(
config
.
do_verification
)
{
{
std
::
cout
<<
"Running verification on CPU."
<<
std
::
endl
;
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
BDataType
,
EDataType
,
EDataType
,
...
@@ -219,8 +218,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
...
@@ -219,8 +218,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
pass
&=
ck
::
utils
::
check_err
(
c_device_tensors
[
i
],
c_host_tensors
[
i
]);
pass
&=
ck
::
utils
::
check_err
(
c_device_tensors
[
i
],
c_host_tensors
[
i
]);
#endif
#endif
}
}
if
(
pass
)
std
::
cout
<<
"Verification on CPU: PASS"
<<
std
::
endl
;
}
}
if
(
config
.
time_kernel
)
if
(
config
.
time_kernel
)
...
...
example/20_grouped_conv_bwd_weight/common.hpp
View file @
3289a5c9
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
@@ -72,7 +72,7 @@ using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLa
...
@@ -72,7 +72,7 @@ using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLa
struct
ExecutionConfig
final
struct
ExecutionConfig
final
{
{
bool
do_verification
=
true
;
bool
do_verification
=
true
;
int
init_method
=
2
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
bool
time_kernel
=
false
;
};
};
...
...
example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
View file @
3289a5c9
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
template
<
ck
::
index_t
NDimSpatial
>
template
<
ck
::
index_t
NDimSpatial
>
bool
run_grouped_conv_bwd_weight
(
const
ExecutionConfig
&
config
,
bool
run_grouped_conv_bwd_weight
(
const
ExecutionConfig
&
config
,
...
@@ -37,8 +37,8 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
...
@@ -37,8 +37,8 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
out
.
GenerateTensorValue
(
GeneratorTensor_2
<
OutDataType
>
{
-
5
,
5
});
out
.
GenerateTensorValue
(
GeneratorTensor_2
<
OutDataType
>
{
-
5
,
5
});
break
;
break
;
default
:
default
:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
0.0
,
1.0
});
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
0.0
,
0.2
});
out
.
GenerateTensorValue
(
GeneratorTensor_3
<
OutDataType
>
{
-
1
,
1
});
out
.
GenerateTensorValue
(
GeneratorTensor_3
<
OutDataType
>
{
-
0.
1
,
0.
1
});
}
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
...
@@ -128,12 +128,7 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
...
@@ -128,12 +128,7 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
wei_device_buf
.
FromDevice
(
wei_device_result
.
mData
.
data
());
wei_device_buf
.
FromDevice
(
wei_device_result
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
return
ck
::
utils
::
check_err
(
wei_device_result
.
mData
,
wei_host_result
.
mData
);
wei_device_result
.
mData
,
wei_host_result
.
mData
,
"Error: Incorrect results!"
,
1
e
-
3
,
1
e
-
3
);
// the errors must be consistent with the less precise type of In/Out DataTypes
}
}
float
avg_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
float
avg_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
...
...
example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
View file @
3289a5c9
...
@@ -175,8 +175,8 @@ int main(int argc, char* argv[])
...
@@ -175,8 +175,8 @@ int main(int argc, char* argv[])
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
break
;
break
;
default:
default:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_
1
<
ADataType
>
{
1
});
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_
Sequential
<
ADataType
,
0
>
{});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_
1
<
BDataType
>
{
1
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_
Sequential
<
BDataType
,
1
>
{});
}
}
c0_n_bias
.
GenerateTensorValue
(
GeneratorTensor_2
<
C0DataType
>
{
-
5
,
5
});
c0_n_bias
.
GenerateTensorValue
(
GeneratorTensor_2
<
C0DataType
>
{
-
5
,
5
});
...
...
example/35_splitK_gemm/run_splitK_gemm_example.inc
View file @
3289a5c9
...
@@ -3,88 +3,6 @@
...
@@ -3,88 +3,6 @@
#pragma once
#pragma once
template
<
typename
DataType
>
inline
__host__
__device__
constexpr
double
get_rtol
()
{
if
constexpr
(
std
::
is_same_v
<
DataType
,
float
>
)
{
return
1
e
-
3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
double
>
)
{
return
1
e
-
6
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
half_t
>
)
{
return
1
e
-
3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bhalf_t
>
)
{
return
5
e
-
2
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int32_t
>
)
{
return
1
e
-
1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int8_t
>
)
{
return
1
e
-
1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
return
2
e
-
1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
return
2
e
-
1
;
}
else
{
return
1
e
-
3
;
}
}
template
<
typename
DataType
>
inline
__host__
__device__
constexpr
double
get_atol
()
{
if
constexpr
(
std
::
is_same_v
<
DataType
,
float
>
)
{
return
1
e
-
3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
double
>
)
{
return
1
e
-
6
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
half_t
>
)
{
return
1
e
-
3
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bhalf_t
>
)
{
return
5
e
-
2
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int32_t
>
)
{
return
1
e
-
1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
int8_t
>
)
{
return
1
e
-
1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
f8_t
>
)
{
return
2
e
-
1
;
}
else
if
constexpr
(
std
::
is_same_v
<
DataType
,
ck
::
bf8_t
>
)
{
return
2
e
-
1
;
}
else
{
return
1
e
-
3
;
}
}
struct
ProblemSize
final
struct
ProblemSize
final
{
{
ck
::
index_t
M
=
3840
;
ck
::
index_t
M
=
3840
;
...
@@ -100,10 +18,9 @@ struct ProblemSize final
...
@@ -100,10 +18,9 @@ struct ProblemSize final
struct
ExecutionConfig
final
struct
ExecutionConfig
final
{
{
// 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
bool
do_verification
=
true
;
int
do_verification
=
1
;
int
init_method
=
1
;
int
init_method
=
7
;
bool
time_kernel
=
false
;
bool
time_kernel
=
false
;
};
};
bool
run_splitK_gemm
(
const
ProblemSize
&
problem_size
,
const
ExecutionConfig
&
config
)
bool
run_splitK_gemm
(
const
ProblemSize
&
problem_size
,
const
ExecutionConfig
&
config
)
...
@@ -151,17 +68,9 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
...
@@ -151,17 +68,9 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
break
;
break
;
case
6
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_PI
<
ADataType
>
{});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_1
<
BDataType
>
{
1
});
break
;
case
7
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_PI_A
<
ADataType
>
{});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_PI_B
<
BDataType
>
{});
break
;
default
:
default
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_
1
<
ADataType
>
{
1
});
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_
Sequential
<
ADataType
,
0
>
{});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_
1
<
BDataType
>
{
1
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_
Sequential
<
BDataType
,
1
>
{});
}
}
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
a_m_k_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpaceSize
());
...
@@ -217,7 +126,7 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
...
@@ -217,7 +126,7 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
false
});
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
false
});
bool
pass
=
true
;
bool
pass
=
true
;
if
(
(
config
.
do_verification
==
1
)
||
(
config
.
do_verification
==
3
)
)
if
(
config
.
do_verification
)
{
{
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
...
@@ -236,7 +145,6 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
...
@@ -236,7 +145,6 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
a_m_k
,
b_k_n
,
c_m_n_host_result
,
a_element_op
,
b_element_op
,
c_element_op
);
a_m_k
,
b_k_n
,
c_m_n_host_result
,
a_element_op
,
b_element_op
,
c_element_op
);
std
::
cout
<<
"Running verification on CPU."
<<
std
::
endl
;
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
if
(
std
::
is_same
<
CDataType
,
ck
::
half_t
>::
value
)
if
(
std
::
is_same
<
CDataType
,
ck
::
half_t
>::
value
)
...
@@ -246,82 +154,10 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
...
@@ -246,82 +154,10 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
}
}
else
else
{
{
pass
&=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
pass
&=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
);
c_m_n_host_result
,
"Error: Incorrect results!"
,
get_rtol
<
CDataType
>
(),
get_atol
<
CDataType
>
());
}
if
(
pass
)
std
::
cout
<<
"Verification on CPU: PASS"
<<
std
::
endl
;
if
(
config
.
init_method
==
6
||
config
.
init_method
==
7
)
{
std
::
cout
<<
std
::
fixed
<<
std
::
setprecision
(
16
);
AccDataType
d
=
ck
::
type_convert
<
AccDataType
>
(
c_m_n_device_result
(
0
,
10
));
AccDataType
h
=
ck
::
type_convert
<
AccDataType
>
(
c_m_n_host_result
(
10
,
0
));
std
::
cout
<<
"device result: "
<<
d
<<
std
::
endl
;
std
::
cout
<<
"host result: "
<<
h
<<
std
::
endl
;
std
::
cout
<<
"expected result: "
<<
M_PI
<<
std
::
endl
;
std
::
cout
<<
"device - host: "
<<
std
::
abs
(
d
-
h
)
<<
std
::
endl
;
std
::
cout
<<
"device - expected: "
<<
std
::
abs
(
d
-
M_PI
)
<<
std
::
endl
;
std
::
cout
<<
"atol: "
<<
get_atol
<
CDataType
>
()
<<
std
::
endl
;
std
::
cout
<<
std
::
endl
<<
std
::
endl
;
}
}
}
}
if
((
config
.
do_verification
==
2
)
||
(
config
.
do_verification
==
3
))
{
Tensor
<
CDataType
>
c_m_n_device_ref_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
DeviceMem
c_m_n_device_ref_buf
(
sizeof
(
CDataType
)
*
c_m_n_device_ref_result
.
mDesc
.
GetElementSpaceSize
());
// GPU verification
using
ReferenceComputeType
=
float
;
using
ReferenceGemmInstanceGPU
=
ck
::
tensor_operation
::
device
::
ReferenceGemm
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
AElementOp
,
BElementOp
,
CElementOp
,
ReferenceComputeType
,
ReferenceComputeType
>
;
auto
ref_gemm_gpu
=
ReferenceGemmInstanceGPU
{};
auto
ref_invoker_gpu
=
ref_gemm_gpu
.
MakeInvoker
();
auto
ref_argument_gpu
=
ref_gemm_gpu
.
MakeArgument
(
static_cast
<
ADataType
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
BDataType
*>
(
b_k_n_device_buf
.
GetDeviceBuffer
()),
static_cast
<
CDataType
*>
(
c_m_n_device_ref_buf
.
GetDeviceBuffer
()),
M
,
N
,
K
,
a_element_op
,
b_element_op
,
c_element_op
);
std
::
cout
<<
"Running verification on GPU."
<<
std
::
endl
;
ref_invoker_gpu
.
Run
(
ref_argument_gpu
,
StreamConfig
{});
c_m_n_device_ref_buf
.
FromDevice
(
c_m_n_device_ref_result
.
mData
.
data
());
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
auto
gpu_pass
=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_device_ref_result
,
"Error: Incorrect results!"
,
get_rtol
<
CDataType
>
(),
get_atol
<
CDataType
>
());
if
(
gpu_pass
)
std
::
cout
<<
"Verification on GPU: PASS"
<<
std
::
endl
;
pass
&=
gpu_pass
;
}
if
(
config
.
time_kernel
)
if
(
config
.
time_kernel
)
{
{
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
,
1
});
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
,
1
});
...
@@ -372,7 +208,7 @@ bool run_splitK_gemm_example(int argc, char* argv[])
...
@@ -372,7 +208,7 @@ bool run_splitK_gemm_example(int argc, char* argv[])
}
}
else
else
{
{
printf
(
"arg1: verification (0=no, 1=
CPU, 2=GPU, 3=CPU and GPU
)
\n
"
);
printf
(
"arg1: verification (0=no, 1=
yes
)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg3: time kernel (0=no, 1=yes)
\n
"
);
printf
(
"arg3: time kernel (0=no, 1=yes)
\n
"
);
printf
(
"arg4: KBatch
\n
"
);
printf
(
"arg4: KBatch
\n
"
);
...
...
example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp
View file @
3289a5c9
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
...
@@ -16,7 +16,6 @@
...
@@ -16,7 +16,6 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/literals.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
...
...
example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
View file @
3289a5c9
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
...
@@ -16,7 +16,6 @@
...
@@ -16,7 +16,6 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/literals.hpp"
template
<
ck
::
index_t
...
Is
>
template
<
ck
::
index_t
...
Is
>
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment