Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
f945f40a
Commit
f945f40a
authored
Apr 20, 2023
by
Astha Rai
Browse files
gemm example: compiles into normal executable and .so
parent
3af15212
Changes
19
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
1553 additions
and
1774 deletions
+1553
-1774
python/ait_impl/generation/Makefile
python/ait_impl/generation/Makefile
+2
-2
python/ait_impl/generation/demo.py
python/ait_impl/generation/demo.py
+0
-19
python/ait_impl/generation/ex/Makefile
python/ait_impl/generation/ex/Makefile
+16
-0
python/ait_impl/generation/ex/ex.cpp
python/ait_impl/generation/ex/ex.cpp
+183
-0
python/ait_impl/generation/ex/gemm_ex.py
python/ait_impl/generation/ex/gemm_ex.py
+293
-0
python/ait_impl/generation/ex/normal/Makefile
python/ait_impl/generation/ex/normal/Makefile
+14
-0
python/ait_impl/generation/ex/normal/gemm_ex.py
python/ait_impl/generation/ex/normal/gemm_ex.py
+293
-0
python/ait_impl/generation/ex/normal/gemm_ex_code.py
python/ait_impl/generation/ex/normal/gemm_ex_code.py
+215
-0
python/ait_impl/generation/ex/normal/input.py
python/ait_impl/generation/ex/normal/input.py
+1
-0
python/ait_impl/generation/ex/potential/Makefile
python/ait_impl/generation/ex/potential/Makefile
+13
-0
python/ait_impl/generation/ex/shared/Makefile
python/ait_impl/generation/ex/shared/Makefile
+14
-0
python/ait_impl/generation/ex/shared/gemm_ex.py
python/ait_impl/generation/ex/shared/gemm_ex.py
+293
-0
python/ait_impl/generation/ex/shared/gemm_ex_code.py
python/ait_impl/generation/ex/shared/gemm_ex_code.py
+215
-0
python/ait_impl/generation/ex/shared/input.py
python/ait_impl/generation/ex/shared/input.py
+1
-0
python/ait_impl/generation/gemm_dev_op.py
python/ait_impl/generation/gemm_dev_op.py
+0
-665
python/ait_impl/generation/gemm_kernel.py
python/ait_impl/generation/gemm_kernel.py
+0
-175
python/ait_impl/generation/norm_ex.py
python/ait_impl/generation/norm_ex.py
+0
-282
python/ait_impl/generation/permute_ex.py
python/ait_impl/generation/permute_ex.py
+0
-43
python/ait_impl/generation/xx.cpp
python/ait_impl/generation/xx.cpp
+0
-588
No files found.
python/ait_impl/generation/Makefile
View file @
f945f40a
gemm
:
xx.o
CFLAGS
=
-I
~/
rocm
/composable_kernel/include
-I
/opt/rocm-5.1.1/hip/include
-I
~/
rocm
/composable_kernel/include/
-I
~/
rocm
/composable_kernel/include/ck/
-I
~/
rocm
/composable_kernel/include/ck/problem_transform/
-I
~/
rocm
/composable_kernel/include/ck/tensor/
-I
~/
rocm
/composable_kernel/include/ck/tensor_description/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/block/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/device/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/device/impl/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/element/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/grid/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/thread/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/warp/
-I
~/
rocm
/composable_kernel/include/ck/host_utility
-I
/external/include/half/
-I
~/
rocm
/composable_kernel/library/include/ck/library/host/
-I
~/
rocm
/composable_kernel/library/include/ck/library/host_tensor/
-I
~/
rocm
/composable_kernel/library/include/ck/library/obselete_driver_offline/
-I
~/
rocm
/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/
-I
~/
rocm
/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/
-I
~/
rocm
/composable_kernel/library/include/ck/library/tensor_operation_instance/
-I
~/
rocm
/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/
" + "
reduce/
-I
~/
rocm
/composable_kernel/library/include/ck/library/tensor_op/
-I
~/
rocm
/composable_kernel/library/include/ck/library/utility/
-I
~/
rocm
/composable_kernel/profiler/include/
CFLAGS
=
-I
~/
workspace
/composable_kernel/include
-I
/opt/
workspace/
rocm-5.1.1/hip/include
-I
~/
workspace
/composable_kernel/include/
-I
~/
workspace
/composable_kernel/include/ck/
-I
~/
workspace
/composable_kernel/include/ck/problem_transform/
-I
~/
workspace
/composable_kernel/include/ck/tensor/
-I
~/
workspace
/composable_kernel/include/ck/tensor_description/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/block/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/device/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/device/impl/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/element/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/grid/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/thread/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/warp/
-I
~/
workspace
/composable_kernel/include/ck/host_utility
-I
/external/include/half/
-I
~/
workspace
/composable_kernel/library/include/ck/library/host/
-I
~/
workspace
/composable_kernel/library/include/ck/library/host_tensor/
-I
~/
workspace
/composable_kernel/library/include/ck/library/obselete_driver_offline/
-I
~/
workspace
/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/
-I
~/
workspace
/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/
-I
~/
workspace
/composable_kernel/library/include/ck/library/tensor_operation_instance/
-I
~/
workspace
/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/
" + "
reduce/
-I
~/
workspace
/composable_kernel/library/include/ck/library/tensor_op/
-I
~/
workspace
/composable_kernel/library/include/ck/library/utility/
-I
~/
workspace
/composable_kernel/profiler/include/
CXXFLAGS
=
-std
=
c++17
xx.o
:
hipcc
-fPIC
-fvisibility
=
hidden
$(CXXFLAGS)
-w
/opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc
$(CFLAGS)
-L
/opt/rocm/rocrand
-lrocrand
-x
hip
-c
-o
xx.cpp
hipcc
-fPIC
-fvisibility
=
hidden
$(CXXFLAGS)
-w
/opt/rocm
-5.3.0
/amdgcn/bitcode/oclc_abi_version_400.bc
$(CFLAGS)
-L
/opt/rocm
-5.3.0
/rocrand
-lrocrand
-x
hip
-c
xx.cpp
python/ait_impl/generation/demo.py
deleted
100644 → 0
View file @
3af15212
import
jinja2
SHAPE_EVAL_TEMPLATE
=
jinja2
.
Template
(
"""
int M = *in_{{ range(rank - 1)|join(' * *in_') }};
int N = *in_{{rank - 1}};
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto K1Number = Number<K1>{};
"""
)
output
=
SHAPE_EVAL_TEMPLATE
.
render
(
rank
=
2
);
print
(
output
)
\ No newline at end of file
python/ait_impl/generation/ex/Makefile
0 → 100644
View file @
f945f40a
# Build the standalone CK GEMM example.
# FIX(review): removed a literal `" + "` artifact (leftover Python string
# concatenation pasted into the Makefile) that broke the
# .../tensor_operation_instance/gpu/reduce/ include path.
# NOTE(review): recipe lines must begin with a hard TAB.
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17

# Link the example executable from the compiled objects.
gemm: ex.o host_tensor.o device_memory.o
	hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm

device_memory.o: ../../../../library/src/utility/device_memory.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp

host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp

# FIX(review): added ex.cpp as a prerequisite so edits to the source trigger
# a rebuild (the rule previously had no dependencies).
ex.o: ex.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
python/ait_impl/generation/ex/ex.cpp
0 → 100644
View file @
f945f40a
#pragma once // NOTE(review): no-op in a .cpp compiled directly; normally header-only.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"

// Problem element types: fp16 operands and output, fp32 accumulation.
using ADataType   = ck::half_t;
using BDataType   = ck::half_t;
using CDataType   = ck::half_t;
using AccDataType = float;

// A is column-major; B and C are row-major (Col/Row come from common.hpp).
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;

// No elementwise transform on any operand.
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;

static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;

// DeviceGemmDl tuned instance. Tuning parameters (BlockSize 256, 128x128 tile,
// K0PerBlock 16, K1 2, per-thread/cluster shapes) are the generator defaults.
// FIX(review): the data-type and GemmSpecialization arguments now reference
// the aliases declared above instead of repeating the raw types, so the
// aliases and the instance cannot silently drift apart. The layout and
// elementwise-op arguments are left spelled out because Col/Row/PassThrough
// are declared in common.hpp and not visible here — presumably they equal
// these types; confirm before substituting those aliases too.
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
    ADataType,
    BDataType,
    CDataType,
    AccDataType,
    ck::tensor_layout::gemm::ColumnMajor,
    ck::tensor_layout::gemm::RowMajor,
    ck::tensor_layout::gemm::RowMajor,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
    ck::tensor_operation::element_wise::PassThrough,
    GemmDefault,
    256, 128, 128, 16, 2, // BlockSize, MPerBlock, NPerBlock, K0PerBlock, K1
    4, 4, 1,              // M1PerThread, N1PerThread, KPerThread
    S<8, 2>, S<8, 2>,     // M1N1 thread cluster M1Xs / N1Xs
    // A blockwise-transfer: slice, cluster, arrange order, src access order,
    // src vector lengths, src vector contiguous dim order, dst vector lengths
    S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>,
    S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>,
    // B blockwise-transfer: same parameter roles as above, over K0/N0/N1/K1
    S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>,
    S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>,
    // C threadwise-transfer: src/dst access order, vector dim, scalars/vector
    S<0, 1, 2, 3, 4, 5>, 5, 4>;

// Host-side reference GEMM used for verification.
using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool
run_gemm
(
const
ProblemSize
&
problem_size
,
const
ExecutionConfig
&
config
)
{
using
namespace
ck
::
literals
;
auto
&
[
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
]
=
problem_size
;
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
if
constexpr
(
std
::
is_same_v
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
)
{
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
}
else
{
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
}
};
Tensor
<
ck
::
half_t
>
a_m_k
(
f_host_tensor_descriptor
(
M
,
K
,
StrideA
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
{}));
Tensor
<
ck
::
half_t
>
b_k_n
(
f_host_tensor_descriptor
(
K
,
N
,
StrideB
,
ck
::
tensor_layout
::
gemm
::
RowMajor
{}));
switch
(
config
.
init_method
)
{
case
0
:
break
;
case
1
:
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ck
::
half_t
>
{
-
5.
f
,
5.
f
}(
a_m_k
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ck
::
half_t
>
{
-
5.
f
,
5.
f
}(
b_k_n
);
break
;
default:
ck
::
utils
::
FillUniformDistribution
<
ck
::
half_t
>
{
-
1.
f
,
1.
f
}(
a_m_k
);
ck
::
utils
::
FillUniformDistribution
<
ck
::
half_t
>
{
-
1.
f
,
1.
f
}(
b_k_n
);
}
Tensor
<
ck
::
half_t
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
ck
::
half_t
>
c_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
mDesc
<<
std
::
endl
;
DeviceMem
a_m_k_device_buf
(
sizeof
(
ck
::
half_t
)
*
a_m_k
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
ck
::
half_t
)
*
b_k_n
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
ck
::
half_t
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpaceSize
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
auto
a_element_op
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
{};
auto
b_element_op
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
{};
auto
c_element_op
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
{};
// do GEMM
auto
gemm
=
DeviceGemmInstance
{};
auto
invoker
=
gemm
.
MakeInvoker
();
auto
argument
=
gemm
.
MakeArgument
(
static_cast
<
ck
::
half_t
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ck
::
half_t
*>
(
b_k_n_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ck
::
half_t
*>
(
c_m_n_device_buf
.
GetDeviceBuffer
()),
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
,
a_element_op
,
b_element_op
,
c_element_op
);
if
(
!
gemm
.
IsSupportedArgument
(
argument
))
{
std
::
cerr
<<
gemm
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
return
true
;
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
std
::
size_t
flop
=
2
_uz
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ck
::
half_t
)
*
M
*
K
+
sizeof
(
ck
::
half_t
)
*
K
*
N
+
sizeof
(
ck
::
half_t
)
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
if
(
config
.
do_verification
)
{
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
a_m_k
,
b_k_n
,
c_m_n_host_result
,
a_element_op
,
b_element_op
,
c_element_op
);
ref_invoker
.
Run
(
ref_argument
);
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
);
}
return
true
;
}
// Parses the command line into a ProblemSize/ExecutionConfig pair and runs the
// GEMM. Returns true when parsing fails (the example still exits "cleanly")
// and otherwise forwards run_gemm's result.
bool run_gemm_example(int argc, char* argv[])
{
    ProblemSize problem_size;
    ExecutionConfig config;

    // Guard clause instead of the short-circuit `!parse || run` expression:
    // a parse failure skips the GEMM entirely, exactly as before.
    if(!parse_cmd_args(argc, argv, problem_size, config))
    {
        return true;
    }
    return run_gemm(problem_size, config);
}
// Process entry point: 0 on success, 1 on failure (inverts the bool result).
int main(int argc, char* argv[])
{
    const bool ok = run_gemm_example(argc, argv);
    return ok ? 0 : 1;
}
python/ait_impl/generation/ex/gemm_ex.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
subprocess
import
re
def SubstituteTemplate(template, values):
    """Expand every ``${key}`` placeholder in *template* using *values*.

    Substitution passes repeat until a full pass changes nothing, so a value
    may itself contain further ``${...}`` placeholders that get expanded on a
    later pass. (Beware: a value containing its own key would never settle.)

    :param template: template string containing ``${key}`` placeholders
    :param values: dict mapping placeholder names to replacement strings
    :return: the fully substituted text
    """
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            regex = "\\$\\{%s\\}" % key
            # FIX: use a callable replacement so backslashes in `value` are
            # inserted literally. Passing `value` as a plain repl string makes
            # re.sub parse it for group references/escapes and raise
            # re.error("bad escape") on values such as "a\\b".
            newtext = re.sub(regex, lambda _match: value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text
class EmitGemmInstance:
    """Renders a standalone composable_kernel DeviceGemmDl example (ex.cpp plus
    a Makefile) from ``${...}`` templates and kicks off a ``make`` build.

    FIXES(review):
    - removed the literal ``" + "`` artifact (leftover Python string
      concatenation) inside the Makefile CFLAGS template, which previously
      emitted a broken ``.../tensor_operation_instance/gpu/" + "reduce/`` path;
    - files are written via context managers so they are always closed/flushed
      before ``make`` runs;
    - removed unused PIPE/STDOUT locals and the duplicate template rendering.
    """

    def __init__(self):
        # Makefile template. NOTE(review): make requires recipe lines to start
        # with a hard TAB; the scrape this was reviewed from lost leading
        # whitespace, so confirm the tabs below survive into the written file.
        self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
	hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o:
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""

        # C++ source template: the ex.cpp example with every DeviceGemmDl
        # template argument exposed as a ${placeholder}.
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
    ${type_a},
    ${type_b},
    ${type_c},
    ${type_acc},
    ${layout_a},
    ${layout_b},
    ${layout_c},
    ${elementwise_op_a},
    ${elementwise_op_b},
    ${elementwise_op_c},
    ${Gemm_spec},
    ${block_size},
    ${mperblock},
    ${nperblock},
    ${k0perblock},
    ${k1},
    ${m1perthread},
    ${n1perthread},
    ${kperthread},
    ${m1n1_thcluster_m1xs},
    ${m1n1_thcluster_n1xs},
    ${ABT_thread_slice_lengths_K0_M0_M1_K1},
    ${ABT_thread_cluster_lengths_K0_M0_M1_K1},
    ${ABT_thread_cluster_arrange_order},
    ${ABT_src_access_order},
    ${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
    ${ABT_src_vec_tensor_cont_dim_order},
    ${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
    ${BBT_thread_slice_lengths_K0_N0_N1_K1},
    ${BBT_thread_cluster_lengths_K0_N0_N1_K1},
    ${BBT_thread_cluster_arrange_order},
    ${BBT_src_access_order},
    ${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
    ${BBT_src_vec_tensor_cont_dim_order},
    ${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
    ${CTT_src_dst_access_order},
    ${CTT_src_dst_vec_dim},
    ${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
    using namespace ck::literals;
    auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
            {
                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };
    Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
    Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
    switch(config.init_method)
    {
    case 0: break;
    case 1:
        ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
        ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
        break;
    default:
        ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
        ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
    }
    Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
    DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
    auto a_element_op = ${elementwise_op_a}{};
    auto b_element_op = ${elementwise_op_b}{};
    auto c_element_op = ${elementwise_op_c}{};
    // do GEMM
    auto gemm = DeviceGemmInstance{};
    auto invoker = gemm.MakeInvoker();
    auto argument = gemm.MakeArgument(
        static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
        static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
        static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
        M,
        N,
        K,
        StrideA,
        StrideB,
        StrideC,
        a_element_op,
        b_element_op,
        c_element_op);
    if(!gemm.IsSupportedArgument(argument))
    {
        std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
        return true;
    }
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
    std::size_t flop = 2_uz * M * N * K;
    std::size_t num_btype =
        sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << gemm.GetTypeString() << std::endl;
    if(config.do_verification)
    {
        auto ref_gemm = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();
        auto ref_argument = ref_gemm.MakeArgument(
            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
        ref_invoker.Run(ref_argument);
        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
        return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
    }
    return true;
}
bool run_gemm_example(int argc, char* argv[])
{
    ProblemSize problem_size;
    ExecutionConfig config;
    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""

    def emit(self):
        """Render both templates with the default fp16 Col x Row -> Row
        DeviceGemmDl configuration, write ``ex.cpp`` and ``Makefile`` into the
        working directory, and run ``make``."""
        # Default tuning values for the DeviceGemmDl instance.
        values = {
            'type_a': 'ck::half_t',
            'type_b': 'ck::half_t',
            'type_c': 'ck::half_t',
            'type_acc': 'float',
            'layout_a': 'ck::tensor_layout::gemm::ColumnMajor',
            'layout_b': 'ck::tensor_layout::gemm::RowMajor',
            'layout_c': 'ck::tensor_layout::gemm::RowMajor',
            'elementwise_op_a': 'ck::tensor_operation::element_wise::PassThrough',
            'elementwise_op_b': 'ck::tensor_operation::element_wise::PassThrough',
            'elementwise_op_c': 'ck::tensor_operation::element_wise::PassThrough',
            'Gemm_spec': 'ck::tensor_operation::device::GemmSpecialization::Default',
            'block_size': '256',
            'mperblock': '128',
            'nperblock': '128',
            'k0perblock': '16',
            'k1': '2',
            'm1perthread': '4',
            'n1perthread': '4',
            'kperthread': '1',
            'm1n1_thcluster_m1xs': 'S<8, 2>',
            'm1n1_thcluster_n1xs': 'S<8, 2>',
            'ABT_thread_slice_lengths_K0_M0_M1_K1': 'S<2, 1, 4, 2>',
            'ABT_thread_cluster_lengths_K0_M0_M1_K1': 'S<8, 1, 32, 1>',
            'ABT_thread_cluster_arrange_order': 'S<0, 3, 1, 2>',
            'ABT_src_access_order': 'S<0, 3, 1, 2>',
            'ABT_src_vec_tensor_lengths_K0_M0_M1_K1': 'S<1, 1, 4, 1>',
            'ABT_src_vec_tensor_cont_dim_order': 'S<0, 3, 1, 2>',
            'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1': 'S<1, 1, 4, 2>',
            'BBT_thread_slice_lengths_K0_N0_N1_K1': 'S<2, 1, 4, 2>',
            'BBT_thread_cluster_lengths_K0_N0_N1_K1': 'S<8, 1, 32, 1>',
            'BBT_thread_cluster_arrange_order': 'S<0, 3, 1, 2>',
            'BBT_src_access_order': 'S<0, 3, 1, 2>',
            'BBT_src_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 1>',
            'BBT_src_vec_tensor_cont_dim_order': 'S<0, 3, 1, 2>',
            'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 2>',
            'CTT_src_dst_access_order': 'S<0, 1, 2, 3, 4, 5>',
            'CTT_src_dst_vec_dim': '5',
            'CTT_dst_scalar_per_vector': '4',
        }

        # Render each template once, echo it (original behavior), and write it.
        source = SubstituteTemplate(self.gemm_devop_template, values)
        print(source)
        with open("ex.cpp", 'w') as cf:
            cf.write(source)

        makefile = SubstituteTemplate(self.make_template, values)
        print(makefile)
        with open("Makefile", 'w') as cf:
            cf.write(makefile)

        # Build the rendered example. NOTE(review): shell=True with a list
        # argument is fragile but preserved from the original; on POSIX the
        # first element ("make") becomes the shell command.
        proc = subprocess.Popen(
            ["make"],
            shell=True,
            env=os.environ.copy(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        out, err = proc.communicate()
# Script entry point: instantiate the generator, then render and build the
# GEMM example in the current working directory.
a = EmitGemmInstance()
a.emit()
python/ait_impl/generation/ex/normal/Makefile
0 → 100644
View file @
f945f40a
# Build the "normal" (plain executable) CK GEMM example.
# FIX(review): removed a literal `" + "` artifact (leftover Python string
# concatenation) that previously broke the
# .../tensor_operation_instance/gpu/reduce/ include path.
# NOTE(review): recipe lines must begin with a hard TAB.
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17

# Link the example executable from the compiled objects.
gemm: ex.o host_tensor.o device_memory.o
	hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm

device_memory.o: ../../../../../library/src/utility/device_memory.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp

host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp

# FIX(review): added ex.cpp as a prerequisite so source edits trigger rebuilds.
ex.o: ex.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
python/ait_impl/generation/ex/normal/gemm_ex.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
subprocess
import
re
def SubstituteTemplate(template, values):
    """Expand every ``${key}`` placeholder in *template* using *values*.

    Substitution passes repeat until a full pass changes nothing, so a value
    may itself contain further ``${...}`` placeholders that get expanded on a
    later pass. (Beware: a value containing its own key would never settle.)

    :param template: template string containing ``${key}`` placeholders
    :param values: dict mapping placeholder names to replacement strings
    :return: the fully substituted text
    """
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            regex = "\\$\\{%s\\}" % key
            # FIX: use a callable replacement so backslashes in `value` are
            # inserted literally. Passing `value` as a plain repl string makes
            # re.sub parse it for group references/escapes and raise
            # re.error("bad escape") on values such as "a\\b".
            newtext = re.sub(regex, lambda _match: value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text
class
EmitGemmInstance
:
def
__init__
(
self
):
self
.
make_template
=
"""
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""
self
.
gemm_devop_template
=
"""
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
${type_a},
${type_b},
${type_c},
${type_acc},
${layout_a},
${layout_b},
${layout_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
}
Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = ${elementwise_op_a}{};
auto b_element_op = ${elementwise_op_b}{};
auto c_element_op = ${elementwise_op_c}{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""
def
emit
(
self
):
values
=
{
'type_a'
:
'ck::half_t'
,
'type_b'
:
'ck::half_t'
,
'type_c'
:
'ck::half_t'
,
'type_acc'
:
'float'
,
'layout_a'
:
'ck::tensor_layout::gemm::ColumnMajor'
,
'layout_b'
:
'ck::tensor_layout::gemm::RowMajor'
,
'layout_c'
:
'ck::tensor_layout::gemm::RowMajor'
,
'elementwise_op_a'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_b'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_c'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'Gemm_spec'
:
'ck::tensor_operation::device::GemmSpecialization::Default'
,
'block_size'
:
'256'
,
'mperblock'
:
'128'
,
'nperblock'
:
'128'
,
'k0perblock'
:
'16'
,
'k1'
:
'2'
,
'm1perthread'
:
'4'
,
'n1perthread'
:
'4'
,
'kperthread'
:
'1'
,
'm1n1_thcluster_m1xs'
:
'S<8, 2>'
,
'm1n1_thcluster_n1xs'
:
'S<8, 2>'
,
'ABT_thread_slice_lengths_K0_M0_M1_K1'
:
'S<2, 1, 4, 2>'
,
'ABT_thread_cluster_lengths_K0_M0_M1_K1'
:
'S<8, 1, 32, 1>'
,
'ABT_thread_cluster_arrange_order'
:
'S<0, 3, 1, 2>'
,
'ABT_src_access_order'
:
'S<0, 3, 1, 2>'
,
'ABT_src_vec_tensor_lengths_K0_M0_M1_K1'
:
'S<1, 1, 4, 1>'
,
'ABT_src_vec_tensor_cont_dim_order'
:
'S<0, 3, 1, 2>'
,
'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1'
:
'S<1, 1, 4, 2>'
,
'BBT_thread_slice_lengths_K0_N0_N1_K1'
:
'S<2, 1, 4, 2>'
,
'BBT_thread_cluster_lengths_K0_N0_N1_K1'
:
'S<8, 1, 32, 1>'
,
'BBT_thread_cluster_arrange_order'
:
'S<0, 3, 1, 2>'
,
'BBT_src_access_order'
:
'S<0, 3, 1, 2>'
,
'BBT_src_vec_tensor_lengths_K0_N0_N1_K1'
:
'S<1, 1, 4, 1>'
,
'BBT_src_vec_tensor_cont_dim_order'
:
'S<0, 3, 1, 2>'
,
'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1'
:
'S<1, 1, 4, 2>'
,
'CTT_src_dst_access_order'
:
'S<0, 1, 2, 3, 4, 5>'
,
'CTT_src_dst_vec_dim'
:
'5'
,
'CTT_dst_scalar_per_vector'
:
'4'
}
template
=
self
.
gemm_devop_template
cf
=
open
(
"ex.cpp"
,
'w'
)
print
(
SubstituteTemplate
(
template
,
values
))
cf
.
write
(
SubstituteTemplate
(
template
,
values
))
cf
.
close
()
m_template
=
self
.
make_template
cf
=
open
(
"Makefile"
,
'w'
)
print
(
SubstituteTemplate
(
m_template
,
values
))
cf
.
write
(
SubstituteTemplate
(
m_template
,
values
))
cf
.
close
()
PIPE
=
-
1
STDOUT
=
-
2
proc
=
subprocess
.
Popen
(
[
"make"
],
shell
=
True
,
env
=
os
.
environ
.
copy
(),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
)
out
,
err
=
proc
.
communicate
()
# This module acts as a script: instantiating the generator and calling
# emit() renders ex.cpp + Makefile and runs `make` as an import-time side
# effect.
a = EmitGemmInstance()
a.emit()
python/ait_impl/generation/ex/normal/gemm_ex_code.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
re
def SubstituteTemplate(template, values):
    """Expand ``${key}`` placeholders in *template* until a fixed point.

    Substitution is re-applied while any replacement changes the text, so a
    value may itself contain further ``${...}`` placeholders that refer to
    other keys in *values*.

    Args:
        template: Text containing ``${key}`` placeholders.
        values:   Mapping of placeholder name -> replacement text.

    Returns:
        The template with every known placeholder expanded.
    """
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            # re.escape() protects keys containing regex metacharacters,
            # and the lambda replacement inserts the value verbatim —
            # previously a value with backslashes (e.g. a Windows path or
            # "\g") was interpreted as a re.sub replacement template and
            # raised "bad escape" or corrupted the output.
            regex = "\\$\\{%s\\}" % re.escape(key)
            newtext = re.sub(regex, lambda _m: value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text
class EmitGemmInstance:
    """Emits a fully-specialised DeviceGemmDl example translation unit.

    Unlike the parameterised generator, ``gemm_devop_template`` here is
    already concrete C++ (no ``${...}`` placeholders remain), so
    :meth:`emit` effectively copies the template to ``xx.cpp``.
    """

    def __init__(self):
        # NOTE(review): the template's verification path instantiated
        # ReferenceGemmInstance without ever defining it, so the generated
        # C++ could not compile when do_verification was set.  The alias is
        # now defined right after the device instance, mirroring the
        # parameterised gemm_ex template.
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl
<ADataType,
BDataType,
CDataType,
AccDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
GemmDefault,
256,
128,
128,
16,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
4>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
"""

    def emit(self):
        """Render the example source to ``xx.cpp`` (and echo it to stdout)."""
        values = {
            # Kept for interface parity with the parameterised generator;
            # this template has no ${...} placeholders left to substitute.
            'type_a': 'ck::half_t',
        }
        # Render once and reuse; `with` closes the file even on error.
        source = SubstituteTemplate(self.gemm_devop_template, values)
        print(source)
        with open("xx.cpp", 'w') as cf:
            cf.write(source)
# Script entry: render the example C++ to xx.cpp as an import-time side
# effect.
a = EmitGemmInstance()
a.emit()
python/ait_impl/generation/ex/normal/input.py
0 → 100644
View file @
f945f40a
# Take GEMM input from the user and send it to the example template.
\ No newline at end of file
python/ait_impl/generation/ex/potential/Makefile
0 → 100644
View file @
f945f40a
# Build the standalone GEMM example with hipcc against Composable Kernel.
CC = /opt/rocm/bin/hipcc
CK_PATH = /dockerx/composable_kernel/
CFLAGS = -O3 -std=c++17 -DCK_AMD_GPU_GFX90A --offload-arch=gfx90a -I "${CK_PATH}/include" -I "${CK_PATH}/library/include" -I "${CK_PATH}/profiler/include"
OBJS = ex.o host_tensor.o device_memory.o

all: $(OBJS)
	$(CC) $(CFLAGS) $(OBJS) -o ex

# ex.o is in OBJS but previously had no rule: make's built-in %.o rule
# would have compiled it with $(CXX), not hipcc, and without $(CFLAGS).
ex.o: ex.cpp
	$(CC) $(CFLAGS) -c ex.cpp

device_memory.o: ../../library/src/utility/device_memory.cpp
	$(CC) $(CFLAGS) -c ../../library/src/utility/device_memory.cpp

host_tensor.o: ../../library/src/utility/host_tensor.cpp
	$(CC) $(CFLAGS) -c ../../library/src/utility/host_tensor.cpp
\ No newline at end of file
python/ait_impl/generation/ex/shared/Makefile
0 → 100644
View file @
f945f40a
# Build the GEMM example as a shared library (test.so) with hipcc.
# NOTE(review): the -I path for .../tensor_operation_instance/gpu/reduce/
# previously contained a literal '" + "' (a leftover Python string
# concatenation); it is folded into a single valid path below.
CFLAGS = -I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17

test.so: ex.o host_tensor.o device_memory.o
	hipcc -shared $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o test.so

device_memory.o: ../../../../../library/src/utility/device_memory.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp

host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp

# ex.o previously had no prerequisite, so it was never rebuilt once it
# existed, even when ex.cpp changed.
ex.o: ex.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
python/ait_impl/generation/ex/shared/gemm_ex.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
subprocess
import
re
def SubstituteTemplate(template, values):
    """Expand ``${key}`` placeholders in *template* until a fixed point.

    Substitution is re-applied while any replacement changes the text, so a
    value may itself contain further ``${...}`` placeholders that refer to
    other keys in *values*.

    Args:
        template: Text containing ``${key}`` placeholders.
        values:   Mapping of placeholder name -> replacement text.

    Returns:
        The template with every known placeholder expanded.
    """
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            # re.escape() protects keys containing regex metacharacters,
            # and the lambda replacement inserts the value verbatim —
            # previously a value with backslashes was interpreted as a
            # re.sub replacement template and raised "bad escape" or
            # corrupted the output.
            regex = "\\$\\{%s\\}" % re.escape(key)
            newtext = re.sub(regex, lambda _m: value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text
class EmitGemmInstance:
    """Generates, writes and builds a parameterised DeviceGemmDl example.

    ``gemm_devop_template`` exposes every DeviceGemmDl template argument as
    a ``${...}`` placeholder; :meth:`emit` fills them in, writes ``ex.cpp``
    and a ``Makefile``, and invokes ``make``.
    """

    def __init__(self):
        # Makefile template.  NOTE(review): the include path for
        # .../tensor_operation_instance/gpu/reduce/ previously contained a
        # literal '" + "' (leftover Python string concatenation) which ended
        # up verbatim in the generated Makefile; it is fixed here.  The ex.o
        # rule also gains its ex.cpp prerequisite so the object is rebuilt
        # when the generated source changes.
        self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
	hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o: ex.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""
        # C++ example template; every ${...} placeholder is filled from the
        # values dict in emit().
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
${type_a},
${type_b},
${type_c},
${type_acc},
${layout_a},
${layout_b},
${layout_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
}
Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = ${elementwise_op_a}{};
auto b_element_op = ${elementwise_op_b}{};
auto c_element_op = ${elementwise_op_c}{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""

    def emit(self):
        """Render ex.cpp and Makefile from the templates and run ``make``.

        Returns:
            (stdout, stderr) bytes captured from the build so callers can
            inspect failures (previously they were silently discarded).
        """
        values = {
            'type_a': 'ck::half_t',
            'type_b': 'ck::half_t',
            'type_c': 'ck::half_t',
            'type_acc': 'float',
            'layout_a': 'ck::tensor_layout::gemm::ColumnMajor',
            'layout_b': 'ck::tensor_layout::gemm::RowMajor',
            'layout_c': 'ck::tensor_layout::gemm::RowMajor',
            'elementwise_op_a': 'ck::tensor_operation::element_wise::PassThrough',
            'elementwise_op_b': 'ck::tensor_operation::element_wise::PassThrough',
            'elementwise_op_c': 'ck::tensor_operation::element_wise::PassThrough',
            'Gemm_spec': 'ck::tensor_operation::device::GemmSpecialization::Default',
            'block_size': '256',
            'mperblock': '128',
            'nperblock': '128',
            'k0perblock': '16',
            'k1': '2',
            'm1perthread': '4',
            'n1perthread': '4',
            'kperthread': '1',
            'm1n1_thcluster_m1xs': 'S<8, 2>',
            'm1n1_thcluster_n1xs': 'S<8, 2>',
            'ABT_thread_slice_lengths_K0_M0_M1_K1': 'S<2, 1, 4, 2>',
            'ABT_thread_cluster_lengths_K0_M0_M1_K1': 'S<8, 1, 32, 1>',
            'ABT_thread_cluster_arrange_order': 'S<0, 3, 1, 2>',
            'ABT_src_access_order': 'S<0, 3, 1, 2>',
            'ABT_src_vec_tensor_lengths_K0_M0_M1_K1': 'S<1, 1, 4, 1>',
            'ABT_src_vec_tensor_cont_dim_order': 'S<0, 3, 1, 2>',
            'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1': 'S<1, 1, 4, 2>',
            'BBT_thread_slice_lengths_K0_N0_N1_K1': 'S<2, 1, 4, 2>',
            'BBT_thread_cluster_lengths_K0_N0_N1_K1': 'S<8, 1, 32, 1>',
            'BBT_thread_cluster_arrange_order': 'S<0, 3, 1, 2>',
            'BBT_src_access_order': 'S<0, 3, 1, 2>',
            'BBT_src_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 1>',
            'BBT_src_vec_tensor_cont_dim_order': 'S<0, 3, 1, 2>',
            'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 2>',
            'CTT_src_dst_access_order': 'S<0, 1, 2, 3, 4, 5>',
            'CTT_src_dst_vec_dim': '5',
            'CTT_dst_scalar_per_vector': '4',
        }
        # Render each template exactly once and reuse the result (the
        # original rendered twice: once to print, once to write).
        source = SubstituteTemplate(self.gemm_devop_template, values)
        print(source)
        with open("ex.cpp", 'w') as cf:
            cf.write(source)
        makefile = SubstituteTemplate(self.make_template, values)
        print(makefile)
        with open("Makefile", 'w') as cf:
            cf.write(makefile)
        # Build in the current directory with the caller's environment.
        proc = subprocess.Popen(
            ["make"],
            shell=True,
            env=os.environ.copy(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        out, err = proc.communicate()
        return out, err
# This module acts as a script: instantiating the generator and calling
# emit() renders ex.cpp + Makefile and runs `make` as an import-time side
# effect.
a = EmitGemmInstance()
a.emit()
python/ait_impl/generation/ex/shared/gemm_ex_code.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
re
def SubstituteTemplate(template, values):
    """Expand ``${key}`` placeholders in *template* until a fixed point.

    Substitution is re-applied while any replacement changes the text, so a
    value may itself contain further ``${...}`` placeholders that refer to
    other keys in *values*.

    Args:
        template: Text containing ``${key}`` placeholders.
        values:   Mapping of placeholder name -> replacement text.

    Returns:
        The template with every known placeholder expanded.
    """
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            # re.escape() protects keys containing regex metacharacters,
            # and the lambda replacement inserts the value verbatim —
            # previously a value with backslashes was interpreted as a
            # re.sub replacement template and raised "bad escape" or
            # corrupted the output.
            regex = "\\$\\{%s\\}" % re.escape(key)
            newtext = re.sub(regex, lambda _m: value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text
class EmitGemmInstance:
    """Emits a fully-specialised DeviceGemmDl example translation unit.

    Unlike the parameterised generator, ``gemm_devop_template`` here is
    already concrete C++ (no ``${...}`` placeholders remain), so
    :meth:`emit` effectively copies the template to ``xx.cpp``.
    """

    def __init__(self):
        # NOTE(review): the template's verification path instantiated
        # ReferenceGemmInstance without ever defining it, so the generated
        # C++ could not compile when do_verification was set.  The alias is
        # now defined right after the device instance, mirroring the
        # parameterised gemm_ex template.
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl
<ADataType,
BDataType,
CDataType,
AccDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
GemmDefault,
256,
128,
128,
16,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
4>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
"""

    def emit(self):
        """Render the example source to ``xx.cpp`` (and echo it to stdout)."""
        values = {
            # Kept for interface parity with the parameterised generator;
            # this template has no ${...} placeholders left to substitute.
            'type_a': 'ck::half_t',
        }
        # Render once and reuse; `with` closes the file even on error.
        source = SubstituteTemplate(self.gemm_devop_template, values)
        print(source)
        with open("xx.cpp", 'w') as cf:
            cf.write(source)
# Script entry: render the example C++ to xx.cpp as an import-time side
# effect.
a = EmitGemmInstance()
a.emit()
python/ait_impl/generation/ex/shared/input.py
0 → 100644
View file @
f945f40a
# Take the GEMM problem description as user input and forward it to the example template.
\ No newline at end of file
python/ait_impl/generation/gemm_dev_op.py
deleted
100644 → 0
View file @
3af15212
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
re
def SubstituteTemplate(template, values):
    """Expand every ``${key}`` placeholder in ``template`` using ``values``.

    Substitution passes repeat until a fixed point is reached, so a
    replacement value may itself contain placeholders for other keys.
    """
    result = template
    while True:
        previous = result
        for name, replacement in values.items():
            result = re.sub("\\$\\{%s\\}" % name, replacement, result)
        if result == previous:
            return result
class
EmitGemmInstance
:
def
__init__
(
self
):
self
.
gemm_devop_template
=
"""
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <
typename ${type_a},
typename ${type_b},
typename ${type_c},
typename ${type_acc},
typename ${layout_a},
typename ${layout_b},
typename ${layout_c},
typename ${elementwise_op_a},
typename ${elementwise_op_b},
typename ${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
typename ${m1n1_thcluster_m1xs},
typename ${m1n1_thcluster_n1xs},
typename ${ABT_thread_slice_lengths_K0_M0_M1_K1},
typename ${ABT_thread_cluster_lengths_K0_M0_M1_K1},
typename ${ABT_thread_cluster_arrange_order},
typename ${ABT_src_access_order},
typename ${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
typename ${ABT_src_vec_tensor_cont_dim_order},
typename ${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
typename ${BBT_thread_slice_lengths_K0_N0_N1_K1},
typename ${BBT_thread_cluster_lengths_K0_N0_N1_K1},
typename ${BBT_thread_cluster_arrange_order},
typename ${BBT_src_access_order},
typename ${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
typename ${BBT_src_vec_tensor_cont_dim_order},
typename ${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
typename ${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>
struct DeviceGemmDl : public DeviceGemm<${layout_a},
${layout_b},
${layout_c},
${type_a},
${type_b},
${type_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c}>
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto K1Number = Number<${k1}>{};
static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
{
assert(K % ${k1} == 0);
const index_t K0 = K / ${k1};
const auto a_grid_desc_m_k = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, ${layout_a}>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ${layout_a}>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));
}
}();
if constexpr(${Gemm_spec} == GemmSpecialization::MNPadding)
{
const auto PadM = (${mperblock} - M % ${mperblock}) % ${mperblock};
return transform_tensor_descriptor(
a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_right_pad_transform(M, PadM)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
else
{
return transform_tensor_descriptor(
a_grid_desc_m_k,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_pass_through_transform(M)),
make_tuple(Sequence<1>{}, Sequence<0>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
}
static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
{
assert(K % ${k1} == 0);
const index_t K0 = K / ${k1};
const auto b_grid_desc_k_n = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, ${layout_b}>::value)
{
return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ${layout_B}>::value)
{
return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB));
}
}();
if constexpr(${Gemm_spec} == GemmSpecialization::MNPadding)
{
const auto PadN = (${nperblock} - N % ${nperblock}) % ${nperblock};
return transform_tensor_descriptor(
b_grid_desc_k_n,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_right_pad_transform(N, PadN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
else
{
return transform_tensor_descriptor(
b_grid_desc_k_n,
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
make_pass_through_transform(N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
}
static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
{
const auto c_grid_desc_m_n = [&]() {
if constexpr(is_same<tensor_layout::gemm::RowMajor, ${layout_c}>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));
}
else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, ${layout_c}>::value)
{
return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC));
}
}();
if constexpr(${Gemm_spec} == GemmSpecialization::MNPadding)
{
const auto PadM = (${mperblock} - M % ${mperblock}) % ${mperblock};
const auto PadN = (${nperblock} - N % ${nperblock}) % ${nperblock};
return transform_tensor_descriptor(
c_grid_desc_m_n,
make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
else
{
return transform_tensor_descriptor(
c_grid_desc_m_n,
make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}
}
using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1));
using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1));
using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1));
// GridwiseGemm
using GridwiseGemm =
GridwiseGemmDl_km_kn_mn_v1r3<BlockSize,
${type_a},
${type_acc},
${type_c},
InMemoryDataOperationEnum::Set,
AGridDesc_K0_M_K1,
BGridDesc_K0_N_K1,
CGridDesc_M_N,
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using AGridDesc_K0_M0_M1_K1 =
decltype(GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{}));
using BGridDesc_K0_N0_N1_K1 =
decltype(GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_K0_N_K1{}));
using CGridDesc_M0_M10_M11_N0_N10_N11 =
decltype(GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(CGridDesc_M_N{}));
using DefaultBlock2CTileMap =
decltype(GridwiseGemm::MakeDefaultBlock2CTileMap(CGridDesc_M_N{}));
// Argument
struct Argument : public BaseArgument
{
Argument(const ${type_a}* p_a_grid,
const ${type_b}* p_b_grid,
${type_c}* p_c_grid,
index_t M,
index_t N,
index_t K,
index_t StrideA,
index_t StrideB,
index_t StrideC,
index_t M01,
index_t N01,
${elementwise_op_a} a_element_op,
${elementwise_op_b} b_element_op,
${elementwise_op_c} c_element_op)
: p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid},
p_c_grid_{p_c_grid},
a_grid_desc_k0_m0_m1_k1_{},
b_grid_desc_k0_n0_n1_k1_{},
c_grid_desc_m0_m10_m11_n0_n10_n11_{},
block_2_ctile_map_{},
M01_{M01},
N01_{N01},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op}
{
a_grid_desc_k0_m_k1_ = DeviceGemmDl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA);
b_grid_desc_k0_n_k1_ = DeviceGemmDl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB);
c_grid_desc_m_n_ = DeviceGemmDl::MakeCGridDescriptor_M_N(M, N, StrideC);
if(GridwiseGemm::CheckValidity(
a_grid_desc_k0_m_k1_, b_grid_desc_k0_n_k1_, c_grid_desc_m_n_))
{
a_grid_desc_k0_m0_m1_k1_ =
GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(a_grid_desc_k0_m_k1_);
b_grid_desc_k0_n0_n1_k1_ =
GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(b_grid_desc_k0_n_k1_);
c_grid_desc_m0_m10_m11_n0_n10_n11_ =
GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(c_grid_desc_m_n_);
block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_);
}
}
// private:
const ${type_a}* p_a_grid_;
const ${type_b}* p_b_grid_;
${type_c}* p_c_grid_;
AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
CGridDesc_M_N c_grid_desc_m_n_;
AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1_;
BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1_;
CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11_;
DefaultBlock2CTileMap block_2_ctile_map_;
// TODO: unused, but may be useful in future.
index_t M01_;
index_t N01_;
// TODO: unused since gridwise_gemm_dl_v1r3 does NOT support prologue for the time being.
${elementwise_op_a} a_element_op_;
${elementwise_op_b} b_element_op_;
${elementwise_op_c} c_element_op_;
};
// Invoker
struct Invoker : public BaseInvoker
{
using Argument = DeviceGemmDl::Argument;
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
{
std::cout << "arg.a_grid_desc_k0_m0_m1_k1_{"
<< arg.a_grid_desc_k0_m_k1_.GetLength(I0) << ", "
<< arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", "
<< arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.b_grid_desc_k0_n0_n1_k1_{"
<< arg.b_grid_desc_k0_n_k1_.GetLength(I0) << ", "
<< arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", "
<< arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl;
std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
<< arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
}
if(!GridwiseGemm::CheckValidity(
arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_))
{
throw std::runtime_error(
"wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdl_v2r3 has invalid setting");
}
const index_t grid_size = GridwiseGemm::CalculateGridSize(
arg.c_grid_desc_m_n_.GetLength(I0), arg.c_grid_desc_m_n_.GetLength(I1));
const auto K0 = arg.a_grid_desc_k0_m0_m1_k1_.GetLength(I0);
const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K0);
const bool has_double_tail_k_block_loop =
GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K0);
float ave_time = 0;
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
kernel_gemm_dl_v1r3<GridwiseGemm,
${type_a},
${type_c},
remove_reference_t<AGridDesc_K0_M0_M1_K1>,
remove_reference_t<BGridDesc_K0_N0_N1_K1>,
remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
remove_reference_t<DefaultBlock2CTileMap>,
true,
true>;
ave_time = launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(${block_size}),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_k0_m0_m1_k1_,
arg.b_grid_desc_k0_n0_n1_k1_,
arg.c_grid_desc_m0_m10_m11_n0_n10_n11_,
arg.block_2_ctile_map_);
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
kernel_gemm_dl_v1r3<GridwiseGemm,
${type_a},
${type_c},
remove_reference_t<AGridDesc_K0_M0_M1_K1>,
remove_reference_t<BGridDesc_K0_N0_N1_K1>,
remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
remove_reference_t<DefaultBlock2CTileMap>,
true,
false>;
ave_time = launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(${block_size}),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_k0_m0_m1_k1_,
arg.b_grid_desc_k0_n0_n1_k1_,
arg.c_grid_desc_m0_m10_m11_n0_n10_n11_,
arg.block_2_ctile_map_);
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
kernel_gemm_dl_v1r3<GridwiseGemm,
${type_a},
${type_c},
remove_reference_t<AGridDesc_K0_M0_M1_K1>,
remove_reference_t<BGridDesc_K0_N0_N1_K1>,
remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
remove_reference_t<DefaultBlock2CTileMap>,
false,
true>;
ave_time = launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(${block_size}),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_k0_m0_m1_k1_,
arg.b_grid_desc_k0_n0_n1_k1_,
arg.c_grid_desc_m0_m10_m11_n0_n10_n11_,
arg.block_2_ctile_map_);
}
else
{
const auto kernel =
kernel_gemm_dl_v1r3<GridwiseGemm,
${type_a},
${type_c},
remove_reference_t<AGridDesc_K0_M0_M1_K1>,
remove_reference_t<BGridDesc_K0_N0_N1_K1>,
remove_reference_t<CGridDesc_M0_M10_M11_N0_N10_N11>,
remove_reference_t<DefaultBlock2CTileMap>,
false,
false>;
ave_time = launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(${block_size}),
0,
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
arg.a_grid_desc_k0_m0_m1_k1_,
arg.b_grid_desc_k0_n0_n1_k1_,
arg.c_grid_desc_m0_m10_m11_n0_n10_n11_,
arg.block_2_ctile_map_);
}
return ave_time;
}
// polymorphic
float Run(const BaseArgument* p_arg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
static bool IsSupportedArgument(const Argument& arg)
{
if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030")
{
return GridwiseGemm::CheckValidity(
arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
}
else
{
return false;
}
}
// polymorphic
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
static auto MakeArgument(const ${type_a}* p_a,
const ${type_b}* p_b,
${type_c}* p_c,
index_t M,
index_t N,
index_t K,
index_t StrideA,
index_t StrideB,
index_t StrideC,
${elementwise_op_a} a_element_op,
${elementwise_op_b} b_element_op,
${elementwise_op_c} c_element_op)
{
return Argument{p_a,
p_b,
p_c,
M,
N,
K,
StrideA,
StrideB,
StrideC,
1,
1,
a_element_op,
b_element_op,
c_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
index_t M,
index_t N,
index_t K,
index_t StrideA,
index_t StrideB,
index_t StrideC,
${elementwise_op_a} a_element_op,
${elementwise_op_b} b_element_op,
${elementwise_op_c} c_element_op) override
{
return std::make_unique<Argument>(static_cast<const ${type_a}*>(p_a),
static_cast<const ${type_b}*>(p_b),
static_cast<${type_c}*>(p_c),
M,
N,
K,
StrideA,
StrideB,
StrideC,
1,
1,
a_element_op,
b_element_op,
c_element_op);
}
// polymorphic
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>(Invoker{});
}
// polymorphic
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceGemmDl"
<< "<"
<< ${block_size} << ", "
<< ${mperblock} << ", "
<< ${nperblock} << ", "
<< ${k0perblock} << ", "
<< ${k1} << ", "
<< ${m1perthread} << ", "
<< ${n1perthread} << ", "
<< ${kperthread}
<< ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
"""
def
emit
(
self
):
values
=
{
'type_a'
:
'ck::half_t'
,
'type_b'
:
'ck::half_t'
,
'type_c'
:
'ck::half_t'
,
'type_acc'
:
'float'
,
'layout_a'
:
'ck::tensor_layout::gemm::ColMajor'
,
'layout_b'
:
'ck::tensor_layout::gemm::RowMajor'
,
'layout_c'
:
'ck::tensor_layout::gemm::RowMajor'
,
'elementwise_op_a'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_b'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_c'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'Gemm_spec'
:
'ck::tensor_operation::device::GemmSpecialization::Default'
,
'block_size'
:
'256'
,
'mperblock'
:
'128'
,
'nperblock'
:
'128'
,
'k0perblock'
:
'16'
,
'k1'
:
'2'
,
'm1perthread'
:
'4'
,
'n1perthread'
:
'4'
,
'kperthread'
:
'1'
,
'm1n1_thcluster_m1xs'
:
'S<8, 2>'
,
'm1n1_thcluster_n1xs'
:
'S<8, 2>'
,
'ABT_thread_slice_lengths_K0_M0_M1_K1'
:
'S<2, 1, 4, 2>'
,
'ABT_thread_cluster_lengths_K0_M0_M1_K1'
:
'S<8, 1, 32, 1>'
,
'ABT_thread_cluster_arrange_order'
:
'S<0, 3, 1, 2>'
,
'ABT_src_access_order'
:
'S<0, 3, 1, 2>'
,
'ABT_src_vec_tensor_lengths_K0_M0_M1_K1'
:
'S<1, 1, 4, 1>'
,
'ABT_src_vec_tensor_cont_dim_order'
:
'S<0, 3, 1, 2>'
,
'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1'
:
'S<1, 1, 4, 2>'
,
'BBT_thread_slice_lengths_K0_N0_N1_K1'
:
'S<2, 1, 4, 2>'
,
'BBT_thread_cluster_lengths_K0_N0_N1_K1'
:
'S<8, 1, 32, 1>'
,
'BBT_thread_cluster_arrange_order'
:
'S<0, 3, 1, 2>'
,
'BBT_src_access_order'
:
'S<0, 3, 1, 2>'
,
'BBT_src_vec_tensor_lengths_K0_N0_N1_K1'
:
'S<1, 1, 4, 1>'
,
'BBT_src_vec_tensor_cont_dim_order'
:
'S<0, 3, 1, 2>'
,
'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1'
:
'S<1, 1, 4, 2>'
,
'CTT_src_dst_access_order'
:
'S<0, 1, 2, 3, 4, 5>'
,
'CTT_src_dst_vec_dim'
:
'5'
,
'CTT_dst_scalar_per_vector'
:
'4'
}
template
=
self
.
gemm_devop_template
cf
=
open
(
"xx.cpp"
,
'w'
)
print
(
SubstituteTemplate
(
template
,
values
))
cf
.
write
(
SubstituteTemplate
(
template
,
values
))
cf
.
close
()
# Script entry: build the emitter and generate the GEMM source file.
a = EmitGemmInstance()
a.emit()
python/ait_impl/generation/gemm_kernel.py
deleted
100644 → 0
View file @
3af15212
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
re
def SubstituteTemplate(template, values):
    """Replace each ``${key}`` placeholder in ``template`` with its value.

    Passes repeat until no substitution changes the text, so replacement
    values may themselves reference other keys.
    """
    text = template
    dirty = True
    while dirty:
        dirty = False
        for key, value in values.items():
            pattern = "\\$\\{%s\\}" % key
            substituted = re.sub(pattern, value, text)
            if substituted != text:
                text = substituted
                dirty = True
    return text
class
EmitGemmInstance
:
def
__init__
(
self
):
self
.
gemm_kernel_template
=
"""
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_K0_M0_M1_K1,
typename BGridDesc_K0_N0_N1_K1,
typename CGridDesc_M0_M10_M11_N0_N10_N11,
typename Block2CTileMap,
bool HasMainKBlockLoop,
bool HasDoubleTailKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dl_v1r3(const ${type_ab}* __restrict__ ${p_a_grid},
const ${type_ab}* __restrict__ ${p_b_grid},
${type_c}* __restrict__ ${p_c_grid},
const ${A_GridDesc_K0_M_K1} ${a_grid_desc_k0_m0_m1_k1},
const ${BGridDesc_K0_N_K1} ${b_grid_desc_k0_n0_n1_k1},
const ${CGridDesc_M0_M10_M11_N0_N10_N11} ${c_grid_desc_m0_m10_m11_n0_n10_n11},
const Block2CTileMap ${block_2_ctile_map})
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(${type_ab});
__shared__ ${type_ab} p_shared_block[shared_block_size];
GridwiseGemm::Run(${p_a_grid},
${p_b_grid},
${p_c_grid},
p_shared_block,
${a_grid_desc_k0_m0_m1_k1},
${b_grid_desc_k0_n0_n1_k1},
${c_grid_desc_m0_m10_m11_n0_n10_n11},
${block_2_ctile_map},
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
template <index_t BlockSize,
${type_ab},
${type_acc},
${type_c},
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
${A_GridDesc_K0_M_K1},
${BGridDesc_K0_N_K1},
${CGridDesc_M_N},
${mperblock},
${nperblock},
${k0perblock},
${k1value},
${M1PerThreadM111},
${N1PerThreadN111},
${KPerThread},
${M11N11ThreadClusterM110Xs},
${M11N11ThreadClusterN110Xs},
${ABlockTransferThreadSliceLengths_K0_M0_M1_K1},
${ABlockTransferThreadClusterLengths_K0_M0_M1_K1},
${ABlockTransferThreadClusterArrangeOrder},
${ABlockTransferSrcAccessOrder},
${ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1},
${ABlockTransferSrcVectorTensorContiguousDimOrder},
${ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1},
${BBlockTransferThreadSliceLengths_K0_N0_N1_K1},
${BBlockTransferThreadClusterLengths_K0_N0_N1_K1},
${BBlockTransferThreadClusterArrangeOrder},
${BBlockTransferSrcAccessOrder},
${BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1},
${BBlockTransferSrcVectorTensorContiguousDimOrder},
${BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1},
${CThreadTransferSrcDstAccessOrder},
${CThreadTransferSrcDstVectorDim},
${CThreadTransferDstScalarPerVector}>
struct GridwiseGemmDl_km_kn_mn_v1r3
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
// K1 should be Number<...>
static constexpr auto K1 = Number<K1Value>{};
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
// TODO: change this. I think it needs multi-dimensional alignment
constexpr auto max_lds_align = K1;
// TODO: check alignment
// A matrix in LDS memory, dst of blockwise copy
constexpr auto a_block_desc_k_m = make_naive_tensor_descriptor_aligned(
make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
// TODO: check alignment
// B matrix in LDS memory, dst of blockwise copy
constexpr auto b_block_desc_k_n = make_naive_tensor_descriptor_aligned(
make_tuple(Number<K0PerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
// TODO: check alignment
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_aligned_space_size =
math::integer_least_multiple(a_block_desc_k_m.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_aligned_space_size =
math::integer_least_multiple(b_block_desc_k_n.GetElementSpaceSize(), max_lds_align);
return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB);
}
"""
def
emit
(
self
):
values
=
{
'function_name'
:
"gemm"
,
'type_a'
:
'ck::half_t'
,
'type_b'
:
'ck::half_t'
,
'type_c'
:
'ck::half_t'
,
'type_acc'
:
'float'
,
'layout_a'
:
'ck::tensor_layout::gemm::RowMajor'
,
'layout_b'
:
'ck::tensor_layout::gemm::RowMajor'
,
'layout_c'
:
'ck::tensor_layout::gemm::RowMajor'
,
'elementwise_op_a'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_b'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_c'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'Gemm_spec'
:
'ck::tensor_operation::device::GemmSpecialization::MNKPadding'
,
'block_size'
:
'256'
,
'mperblock'
:
'64'
,
'nperblock'
:
'128'
,
'kperblock'
:
'32'
,
'k1'
:
'8'
,
'mperxdl'
:
'32'
,
'nperxdl'
:
'32'
,
'mxdlperwave'
:
'1'
,
'nxdlperwave'
:
'2'
,
'threadclusterlength_a'
:
'ck::Sequence<4,64,1>'
,
'threadclusterarrange_a'
:
'ck::Sequence<1,0,2>'
,
'srcaccessorder_a'
:
'ck::Sequence<1,0,2>'
,
'srcvectordim_a'
:
'2'
,
'srcscalarpervec_a'
:
'8'
,
'dstscalarpervec_a'
:
'8'
,
'add_extra_dim_a'
:
'1'
,
'threadclusterlength_b'
:
'ck::Sequence<8,32,1>'
,
'threadclusterarrange_b'
:
'ck::Sequence<0,2,1>'
,
'srcaccessorder_b'
:
'ck::Sequence<0,2,1>'
,
'srcvectordim_b'
:
'1'
,
'srcscalarpervec_b'
:
'4'
,
'dstscalarpervec_b'
:
'2'
,
'add_extra_dim_b'
:
'0'
,
'dstscalarpervec_c'
:
'8'
}
template
=
self
.
gemm_template
print
(
SubstituteTemplate
(
template
,
values
))
\ No newline at end of file
python/ait_impl/generation/norm_ex.py
deleted
100644 → 0
View file @
3af15212
import
os
import
re
from
hashlib
import
sha1
from
typing
import
Any
,
Dict
,
OrderedDict
import
jinja2
#from ...target import Target
#templating
FUNC_CALL_PARAM_TEMPLATE
=
jinja2
.
Template
(
"(void *)({{name}})"
)
INSTANCE_TEMPLATE
=
jinja2
.
Template
(
"""
using {{name}} = {{ config_name }};
"""
)
ARGS_PARSE_TEMPLATE
=
jinja2
.
Template
(
"""
{% for idx in range(rank) %}
const int64_t in_{{idx}} = std::stoi(argv[{{ idx + 1 }}]);
{% endfor %}
"""
)
STRUCTS_DEF_TEMPLATE
=
jinja2
.
Template
(
"""
struct ProfilerMemoryPool {
ProfilerMemoryPool() {
std::random_device rd;
gen = std::mt19937(rd());
uniform_dist = std::uniform_int_distribution<int64_t>(1, 48964896);
offsets.reserve(512);
strides.reserve(512);
copies.reserve(512);
ptrs.reserve(512);
}
~ProfilerMemoryPool() {
for(int i = 0; i < ptrs.size(); i++){
hipFree(ptrs[i]);
}
}
template <typename DType>
DType* AllocateGaussianTensor(int64_t size) {
size_t length = size * sizeof(DType);
DType *d_x;
hipMalloc(&d_x, length);
float mean = 0.0f;
float stddev = 1.0f;
uint64_t seed = uniform_dist(gen);
rocrand_set_seed(generator, seed);
rocrand_generate_normal(generator, reinterpret_cast<float*>(d_x), size, mean, stddev);
return d_x;
}
ck::half_t* AllocateHalfGaussianTensor(int64_t size) {
return reinterpret_cast<ck::half_t*>(
AllocateGaussianTensor<ck::half_t>(size));
}
int AllocateHalfTensor(int64_t size, int64_t copy) {
offsets.push_back(0);
strides.push_back(size);
copies.push_back(copy);
auto ptr = AllocateHalfGaussianTensor(size * copy);
ptrs.push_back(reinterpret_cast<void*>(ptr));
return ptrs.size() - 1;
}
ck::half_t* RequestHalfTensorByIdx(int idx) {
auto copy = copies.at(idx);
auto offset = offsets.at(idx);
auto stride = strides.at(idx);
ck::half_t* ptr = reinterpret_cast<ck::half_t*>(ptrs.at(idx));
ptr += offset;
offset += stride;
if (offset == copy * stride) {
offset = 0;
}
offsets[idx] = offset;
return ptr;
}
std::vector<int64_t> offsets;
std::vector<int64_t> strides;
std::vector<int64_t> copies;
std::vector<void*> ptrs;
std::mt19937 gen;
std::uniform_int_distribution<int64_t> uniform_dist;
rocrand_generator generator;
};
// hack for DeviceMem linking error
// TODO fix this by making CK a header-only lib
// <<< hack begin
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p) const
{
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p) const
{
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
struct KernelTimerImpl
{
KernelTimerImpl() {
hipGetErrorString(hipEventCreate(&mStart));
hipGetErrorString(hipEventCreate(&mEnd));
}
~KernelTimerImpl() {
hipGetErrorString(hipEventDestroy(mStart));
hipGetErrorString(hipEventDestroy(mEnd));
}
void Start() {
hipGetErrorString(hipDeviceSynchronize());
hipGetErrorString(hipEventRecord(mStart, nullptr));
}
void End() {
hipGetErrorString(hipEventRecord(mEnd, nullptr));
hipGetErrorString(hipEventSynchronize(mEnd));
}
float GetElapsedTime() const {
float time;
hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
return time;
}
hipEvent_t mStart, mEnd;
};
// >>> hack end
"""
)
FUNC_TEMPLATE
=
jinja2
.
Template
(
"""
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <random>
#include <rocrand/rocrand.h>
#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/reduction_operator.hpp"
{{extra_headers}}
{{extra_code}}
{{instances_decl}}
{{func_signature}}
{
{{shape_eval}}
{{exec_paths}}
}
"""
)
FUNC_CALL_TEMPLATE
=
jinja2
.
Template
(
"""
{{indent}}{{func_name}}(
{{indent}} {{input}},
{{indent}} {{output}},
{% for name in input_dim_names %}
{{indent}} const_cast<int64_t *>(&{{name}}),
{% endfor %}
{{indent}} stream
{{indent}});
"""
)
PROFILER_TEMPLATE
=
jinja2
.
Template
(
"""
size_t GLOBAL_WORKSPACE_SIZE = 0;
{{op_func}}
{{structs_def}}
int main(int argc, char** argv) {
{{args_parse}}
auto memory_pool = std::make_unique<ProfilerMemoryPool>();
hipStream_t stream = nullptr;
{{tensor_decl}}
// warmup
for(int i = 0; i < 3; ++i) {
{{func_call}}
}
// run
KernelTimerImpl timer;
timer.Start();
for(int i = 0; i < 5; ++i) {
{{func_call}}
}
timer.End();
std::cout << "WS:" <<GLOBAL_WORKSPACE_SIZE<<std::endl;
std::cout << "TIME:" << timer.GetElapsedTime() << std::endl;
}
"""
)
# rendering (messy, need to modularize and organize)
# def gen_profiler(
# shape_eval_template: jinja2.Template,
# exec_template: jinja2.Template,
# tensor_decl_template: jinja2.Template,
# extra_header_template: jinja2.Template,
# get_func_signature: Any,
# extra_code: str = "",
# func_call_template: jinja2.Template = FUNC_CALL_TEMPLATE,
# indent: str = " ",
# ) -> str:
# shape_eval_template: jinja2.Template
# exec_template: jinja2.Template
# tensor_decl_template: jinja2.Template
#extra_header_template: jinja2.Template
get_func_signature
:
Any
extra_code
:
str
=
""
func_call_template
:
jinja2
.
Template
=
FUNC_CALL_TEMPLATE
indent
:
str
=
" "
# shape_eval = shape_eval_template.render(rank=2) #if shape_eval_template else ""
# exe_path = exec_template.render(instance="DeviceInstance",dtype="void",reduce_dims=1,rank=2,eps=eps,)
instances
=
INSTANCE_TEMPLATE
.
render
(
name
=
"DeviceInstance"
,
config_name
=
"ck::tensor_operation::device::DeviceLayernormImpl"
,)
op_func
=
FUNC_TEMPLATE
.
render
(
instances_decl
=
instances
,
#func_signature=get_func_signature(func_attrs),
#shape_eval=shape_eval,
#exec_paths=exe_path,
#extra_headers=extra_header_template.render(),
extra_code
=
extra_code
,)
structs_def
=
STRUCTS_DEF_TEMPLATE
.
render
()
args_parse
=
ARGS_PARSE_TEMPLATE
.
render
(
rank
=
2
)
#tensor_decl = tensor_decl_template.render(rank=2)
input_dim_names
=
[
f
"in_
{
i
}
"
for
i
in
range
(
2
)]
func_call
=
func_call_template
.
render
(
func_name
=
"norm"
,
input
=
"(void *) memory_pool->RequestHalfTensorByIdx(0)"
,
gamma
=
"(void *) memory_pool->RequestHalfTensorByIdx(2)"
,
beta
=
"(void *) memory_pool->RequestHalfTensorByIdx(3)"
,
output
=
"(void *) memory_pool->RequestHalfTensorByIdx(1)"
,
input_dim_names
=
input_dim_names
,
indent
=
indent
,
)
code
=
PROFILER_TEMPLATE
.
render
(
op_func
=
op_func
,
structs_def
=
structs_def
,
args_parse
=
args_parse
,
#tensor_decl=tensor_decl,
func_call
=
func_call
,
)
# print(instances)
# print(args_parse)
# print(structs_def)
#print(func_call)
#print(op_func)
print
(
code
)
python/ait_impl/generation/permute_ex.py
deleted
100644 → 0
View file @
3af15212
import
jinja2
EXTRA_SHAPE_TEMPLATE
=
jinja2
.
Template
(
"""
{{indent}}const int64_t stride_a = *a_dim1;
{{indent}}const int64_t stride_b = *b_dim1;
{{indent}}const int64_t stride_c = *c_dim1;
ck::index_t M0 = M / G1 / G2;
ck::index_t M1 = G1;
ck::index_t M2 = G2;
ck::index_t N0 = G3;
ck::index_t N1 = N / G3;
// GEMM shape
//ck::index_t M = M0 * M1 * M2;
//ck::index_t N = N0 * N1;
//ck::index_t K = 128;
//ck::index_t stride_A = K;
//ck::index_t stride_B = K;
// E = [M0, N0, M1, N1, M2]
/* 0, 3, 1, 4, 2
ck::index_t stride_E_M0 = N0 * M1 * N1 * M2;
ck::index_t stride_E_M1 = N1 * M2;
ck::index_t stride_E_M2 = 1;
ck::index_t stride_E_N0 = M1 * N1 * M2;
ck::index_t stride_E_N1 = M2;
*/
// E = [M2, M0, N0, M1, N1] 2, 0, 3, 1, 4
ck::index_t stride_E_M0 = N0* M1* N1;
ck::index_t stride_E_M1 = N1;
ck::index_t stride_E_M2 = M0* N0* M1* N1;
ck::index_t stride_E_N0 = M1 * N1;
ck::index_t stride_E_N1 = 1;
// D = [0, N0, 0, N1, 0]
ck::index_t stride_D_M0 = 0;
ck::index_t stride_D_M1 = 0;
ck::index_t stride_D_M2 = 0;
ck::index_t stride_D_N0 = N1;
ck::index_t stride_D_N1 = 1;
"""
)
output
=
EXTRA_SHAPE_TEMPLATE
.
render
(
indent
=
" "
);
print
(
output
)
\ No newline at end of file
python/ait_impl/generation/xx.cpp
deleted
100644 → 0
View file @
3af15212
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
template
<
typename
ck
::
half_t
,
typename
ck
::
half_t
,
typename
ck
::
half_t
,
typename
float
,
typename
ck
::
tensor_layout
::
gemm
::
ColMajor
,
typename
ck
::
tensor_layout
::
gemm
::
RowMajor
,
typename
ck
::
tensor_layout
::
gemm
::
RowMajor
,
typename
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
typename
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
typename
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
typename
S
<
8
,
2
>,
typename
S
<
8
,
2
>
,
typename
S
<
2
,
1
,
4
,
2
>
,
typename
S
<
8
,
1
,
32
,
1
>
,
typename
S
<
0
,
3
,
1
,
2
>
,
typename
S
<
0
,
3
,
1
,
2
>
,
typename
S
<
1
,
1
,
4
,
1
>
,
typename
S
<
0
,
3
,
1
,
2
>
,
typename
S
<
1
,
1
,
4
,
2
>
,
typename
S
<
2
,
1
,
4
,
2
>
,
typename
S
<
8
,
1
,
32
,
1
>
,
typename
S
<
0
,
3
,
1
,
2
>
,
typename
S
<
0
,
3
,
1
,
2
>
,
typename
S
<
1
,
1
,
4
,
1
>
,
typename
S
<
0
,
3
,
1
,
2
>
,
typename
S
<
1
,
1
,
4
,
2
>
,
typename
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
struct
DeviceGemmDl
:
public
DeviceGemm
<
ck
::
tensor_layout
::
gemm
::
ColMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
I4
=
Number
<
4
>
{};
static
constexpr
auto
I5
=
Number
<
5
>
{};
static
constexpr
auto
K1Number
=
Number
<
2
>
{};
// Build the [K0, M, K1] grid descriptor for matrix A.
// The generated code carried `if constexpr` branches comparing two *concrete*
// layout tags (including a reference to a `ColumnMajor` name inconsistent
// with the `ColMajor` used elsewhere) plus a constant-false
// `GemmSpecialization::Default == MNPadding` padding branch; had no branch
// matched, the descriptor lambda would fall off the end. Only the live path
// for this fully-specialized op (A column-major, no padding) is kept.
static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
{
    // K must split evenly into (K0, K1) with K1 == 2.
    assert(K % 2 == 0);

    const index_t K0 = K / 2;

    // A is column-major: element (m, k) lives at m + k * StrideA.
    const auto a_grid_desc_m_k =
        make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA));

    // Default specialization (no M padding): unmerge K into (K0, K1) at output
    // dims {0, 2}; pass M through to output dim {1}.
    return transform_tensor_descriptor(
        a_grid_desc_m_k,
        make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
                   make_pass_through_transform(M)),
        make_tuple(Sequence<1>{}, Sequence<0>{}),
        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
// Build the [K0, N, K1] grid descriptor for matrix B.
// The generated code contained an unsubstituted `${layout_B}` template
// placeholder inside a dead `if constexpr` branch (ill-formed C++) and a
// constant-false MNPadding branch. Only the live path for this
// fully-specialized op (B row-major, no padding) is kept.
static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
{
    // K must split evenly into (K0, K1) with K1 == 2.
    assert(K % 2 == 0);

    const index_t K0 = K / 2;

    // B is row-major: element (k, n) lives at k * StrideB + n.
    const auto b_grid_desc_k_n =
        make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1));

    // Default specialization (no N padding): unmerge K into (K0, K1) at output
    // dims {0, 2}; pass N through to output dim {1}.
    return transform_tensor_descriptor(
        b_grid_desc_k_n,
        make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
                   make_pass_through_transform(N)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
}
// Build the [M, N] grid descriptor for the output matrix C.
// As with the A/B builders, the generated code carried dead `if constexpr`
// branches over two concrete layout tags (including the inconsistent
// `ColumnMajor` spelling) and a constant-false MNPadding branch. Only the
// live path (C row-major, no padding) is kept.
static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC)
{
    // C is row-major: element (m, n) lives at m * StrideC + n.
    const auto c_grid_desc_m_n =
        make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1));

    // Default specialization: no padding — pass both dims through unchanged.
    return transform_tensor_descriptor(
        c_grid_desc_m_n,
        make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));
}
// Descriptor types, instantiated with dummy extents just to capture the type.
using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1));
using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1));
using CGridDesc_M_N     = decltype(MakeCGridDescriptor_M_N(1, 1, 1));

// GridwiseGemm
// NOTE: the generated code passed an undefined identifier `BlockSize` as the
// first argument — the template's BlockSize parameter was never substituted.
// The concrete value is 256, matching the dim3(256) kernel launches and the
// "DeviceGemmDl<256, ...>" string in GetTypeString().
using GridwiseGemm =
    GridwiseGemmDl_km_kn_mn_v1r3<256,        // BlockSize
                                 ck::half_t, // A/B data type
                                 float,      // accumulation type
                                 ck::half_t, // C data type
                                 InMemoryDataOperationEnum::Set,
                                 AGridDesc_K0_M_K1,
                                 BGridDesc_K0_N_K1,
                                 CGridDesc_M_N,
                                 128, // MPerBlock
                                 128, // NPerBlock
                                 16,  // K0PerBlock
                                 2,   // K1
                                 4,   // M1PerThread
                                 4,   // N1PerThread
                                 1,   // KPerThread
                                 // A/B thread-cluster and transfer configs
                                 // (copied verbatim from the generated instance).
                                 S<8, 2>,
                                 S<8, 2>,
                                 S<2, 1, 4, 2>,
                                 S<8, 1, 32, 1>,
                                 S<0, 3, 1, 2>,
                                 S<0, 3, 1, 2>,
                                 S<1, 1, 4, 1>,
                                 S<0, 3, 1, 2>,
                                 S<1, 1, 4, 2>,
                                 S<2, 1, 4, 2>,
                                 S<8, 1, 32, 1>,
                                 S<0, 3, 1, 2>,
                                 S<0, 3, 1, 2>,
                                 S<1, 1, 4, 1>,
                                 S<0, 3, 1, 2>,
                                 S<1, 1, 4, 2>,
                                 S<0, 1, 2, 3, 4, 5>,
                                 5,
                                 4>;

// Blocked/tiled descriptor types derived by the gridwise kernel.
using AGridDesc_K0_M0_M1_K1 =
    decltype(GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{}));
using BGridDesc_K0_N0_N1_K1 =
    decltype(GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_K0_N_K1{}));
using CGridDesc_M0_M10_M11_N0_N10_N11 =
    decltype(GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(CGridDesc_M_N{}));
using DefaultBlock2CTileMap =
    decltype(GridwiseGemm::MakeDefaultBlock2CTileMap(CGridDesc_M_N{}));
// Argument
// Wraps one GEMM problem instance: raw device pointers plus the grid
// descriptors derived from (M, N, K, strides). Descriptors are built in the
// constructor body (not the initializer list) because the tiled forms depend
// on the untiled ones.
struct Argument : public BaseArgument
{
    Argument(const ck::half_t* p_a_grid,
             const ck::half_t* p_b_grid,
             ck::half_t* p_c_grid,
             index_t M,
             index_t N,
             index_t K,
             index_t StrideA,
             index_t StrideB,
             index_t StrideC,
             index_t M01,
             index_t N01,
             ck::tensor_operation::element_wise::PassThrough a_element_op,
             ck::tensor_operation::element_wise::PassThrough b_element_op,
             ck::tensor_operation::element_wise::PassThrough c_element_op)
        : p_a_grid_{p_a_grid},
          p_b_grid_{p_b_grid},
          p_c_grid_{p_c_grid},
          a_grid_desc_k0_m0_m1_k1_{},
          b_grid_desc_k0_n0_n1_k1_{},
          c_grid_desc_m0_m10_m11_n0_n10_n11_{},
          block_2_ctile_map_{},
          M01_{M01},
          N01_{N01},
          a_element_op_{a_element_op},
          b_element_op_{b_element_op},
          c_element_op_{c_element_op}
    {
        // Untiled descriptors straight from the problem sizes.
        a_grid_desc_k0_m_k1_ = DeviceGemmDl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA);
        b_grid_desc_k0_n_k1_ = DeviceGemmDl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB);
        c_grid_desc_m_n_     = DeviceGemmDl::MakeCGridDescriptor_M_N(M, N, StrideC);

        // Only build the blocked/tiled descriptors when the problem is valid
        // for this kernel; otherwise they stay default-constructed (and
        // IsSupportedArgument / Run will reject the argument).
        if(GridwiseGemm::CheckValidity(
               a_grid_desc_k0_m_k1_, b_grid_desc_k0_n_k1_, c_grid_desc_m_n_))
        {
            a_grid_desc_k0_m0_m1_k1_ =
                GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(a_grid_desc_k0_m_k1_);

            b_grid_desc_k0_n0_n1_k1_ =
                GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(b_grid_desc_k0_n_k1_);

            c_grid_desc_m0_m10_m11_n0_n10_n11_ =
                GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(c_grid_desc_m_n_);

            block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_);
        }
    }

    //  private:
    // Device pointers to the A/B inputs and C output.
    const ck::half_t* p_a_grid_;
    const ck::half_t* p_b_grid_;
    ck::half_t* p_c_grid_;

    // Untiled descriptors (always valid after construction).
    AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
    BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
    CGridDesc_M_N c_grid_desc_m_n_;

    // Tiled descriptors (only populated when CheckValidity passed).
    AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1_;
    BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1_;
    CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11_;
    DefaultBlock2CTileMap block_2_ctile_map_;

    // TODO: unused, but may be useful in future.
    index_t M01_;
    index_t N01_;

    // TODO: unused since gridwise_gemm_dl_v1r3 does NOT support prologue for the time being.
    ck::tensor_operation::element_wise::PassThrough a_element_op_;
    ck::tensor_operation::element_wise::PassThrough b_element_op_;
    ck::tensor_operation::element_wise::PassThrough c_element_op_;
};
// Invoker
struct
Invoker
:
public
BaseInvoker
{
using
Argument
=
DeviceGemmDl
::
Argument
;
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
{
std
::
cout
<<
"arg.a_grid_desc_k0_m0_m1_k1_{"
<<
arg
.
a_grid_desc_k0_m_k1_
.
GetLength
(
I0
)
<<
", "
<<
arg
.
a_grid_desc_k0_m_k1_
.
GetLength
(
I1
)
<<
", "
<<
arg
.
a_grid_desc_k0_m_k1_
.
GetLength
(
I2
)
<<
"}"
<<
std
::
endl
;
std
::
cout
<<
"arg.b_grid_desc_k0_n0_n1_k1_{"
<<
arg
.
b_grid_desc_k0_n_k1_
.
GetLength
(
I0
)
<<
", "
<<
arg
.
b_grid_desc_k0_n_k1_
.
GetLength
(
I1
)
<<
", "
<<
arg
.
b_grid_desc_k0_n_k1_
.
GetLength
(
I2
)
<<
"}"
<<
std
::
endl
;
std
::
cout
<<
"arg.c_grid_desc_m_n_{ "
<<
arg
.
c_grid_desc_m_n_
.
GetLength
(
I0
)
<<
", "
<<
arg
.
c_grid_desc_m_n_
.
GetLength
(
I1
)
<<
"}"
<<
std
::
endl
;
}
if
(
!
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_k0_m_k1_
,
arg
.
b_grid_desc_k0_n_k1_
,
arg
.
c_grid_desc_m_n_
))
{
throw
std
::
runtime_error
(
"wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdl_v2r3 has invalid setting"
);
}
const
index_t
grid_size
=
GridwiseGemm
::
CalculateGridSize
(
arg
.
c_grid_desc_m_n_
.
GetLength
(
I0
),
arg
.
c_grid_desc_m_n_
.
GetLength
(
I1
));
const
auto
K0
=
arg
.
a_grid_desc_k0_m0_m1_k1_
.
GetLength
(
I0
);
const
bool
has_main_k_block_loop
=
GridwiseGemm
::
CalculateHasMainKBlockLoop
(
K0
);
const
bool
has_double_tail_k_block_loop
=
GridwiseGemm
::
CalculateHasDoubleTailKBlockLoop
(
K0
);
float
ave_time
=
0
;
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dl_v1r3
<
GridwiseGemm
,
ck
::
half_t
,
ck
::
half_t
,
remove_reference_t
<
AGridDesc_K0_M0_M1_K1
>
,
remove_reference_t
<
BGridDesc_K0_N0_N1_K1
>
,
remove_reference_t
<
CGridDesc_M0_M10_M11_N0_N10_N11
>
,
remove_reference_t
<
DefaultBlock2CTileMap
>
,
true
,
true
>
;
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
256
),
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
a_grid_desc_k0_m0_m1_k1_
,
arg
.
b_grid_desc_k0_n0_n1_k1_
,
arg
.
c_grid_desc_m0_m10_m11_n0_n10_n11_
,
arg
.
block_2_ctile_map_
);
}
else
if
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dl_v1r3
<
GridwiseGemm
,
ck
::
half_t
,
ck
::
half_t
,
remove_reference_t
<
AGridDesc_K0_M0_M1_K1
>
,
remove_reference_t
<
BGridDesc_K0_N0_N1_K1
>
,
remove_reference_t
<
CGridDesc_M0_M10_M11_N0_N10_N11
>
,
remove_reference_t
<
DefaultBlock2CTileMap
>
,
true
,
false
>
;
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
256
),
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
a_grid_desc_k0_m0_m1_k1_
,
arg
.
b_grid_desc_k0_n0_n1_k1_
,
arg
.
c_grid_desc_m0_m10_m11_n0_n10_n11_
,
arg
.
block_2_ctile_map_
);
}
else
if
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dl_v1r3
<
GridwiseGemm
,
ck
::
half_t
,
ck
::
half_t
,
remove_reference_t
<
AGridDesc_K0_M0_M1_K1
>
,
remove_reference_t
<
BGridDesc_K0_N0_N1_K1
>
,
remove_reference_t
<
CGridDesc_M0_M10_M11_N0_N10_N11
>
,
remove_reference_t
<
DefaultBlock2CTileMap
>
,
false
,
true
>
;
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
256
),
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
a_grid_desc_k0_m0_m1_k1_
,
arg
.
b_grid_desc_k0_n0_n1_k1_
,
arg
.
c_grid_desc_m0_m10_m11_n0_n10_n11_
,
arg
.
block_2_ctile_map_
);
}
else
{
const
auto
kernel
=
kernel_gemm_dl_v1r3
<
GridwiseGemm
,
ck
::
half_t
,
ck
::
half_t
,
remove_reference_t
<
AGridDesc_K0_M0_M1_K1
>
,
remove_reference_t
<
BGridDesc_K0_N0_N1_K1
>
,
remove_reference_t
<
CGridDesc_M0_M10_M11_N0_N10_N11
>
,
remove_reference_t
<
DefaultBlock2CTileMap
>
,
false
,
false
>
;
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
256
),
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
a_grid_desc_k0_m0_m1_k1_
,
arg
.
b_grid_desc_k0_n0_n1_k1_
,
arg
.
c_grid_desc_m0_m10_m11_n0_n10_n11_
,
arg
.
block_2_ctile_map_
);
}
return
ave_time
;
}
// polymorphic
float
Run
(
const
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
}
};
static
constexpr
bool
IsValidCompilationParameter
()
{
// TODO: properly implement this check
return
true
;
}
// Returns true when this DL GEMM instance can run the given problem:
// the device must be one of the supported targets and the descriptors
// must satisfy the gridwise kernel's validity constraints.
static bool IsSupportedArgument(const Argument& arg)
{
    const auto device = ck::get_device_name();

    // DL kernels are only enabled for these targets.
    if(device != "gfx906" && device != "gfx1030")
    {
        return false;
    }

    return GridwiseGemm::CheckValidity(
        arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_);
}
// polymorphic
// Type-erased overload used through the base-operator interface.
// NOTE(review): the dynamic_cast result is dereferenced without a null
// check — passing a BaseArgument that is not a DeviceGemmDl::Argument
// would be undefined behavior; presumably callers always pass the
// matching Argument type — confirm.
bool IsSupportedArgument(const BaseArgument* p_arg) override
{
    return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
}
// Build a concrete Argument from typed device pointers and problem sizes.
// M01/N01 are fixed to 1 (block-to-tile mapping tuning; unused for now —
// see the TODO on Argument::M01_/N01_).
static auto MakeArgument(const ck::half_t* p_a,
                         const ck::half_t* p_b,
                         ck::half_t* p_c,
                         index_t M,
                         index_t N,
                         index_t K,
                         index_t StrideA,
                         index_t StrideB,
                         index_t StrideC,
                         ck::tensor_operation::element_wise::PassThrough a_element_op,
                         ck::tensor_operation::element_wise::PassThrough b_element_op,
                         ck::tensor_operation::element_wise::PassThrough c_element_op)
{
    return Argument{p_a,
                    p_b,
                    p_c,
                    M,
                    N,
                    K,
                    StrideA,
                    StrideB,
                    StrideC,
                    1,
                    1,
                    a_element_op,
                    b_element_op,
                    c_element_op};
}
// Build a value-typed Invoker for this op.
static auto MakeInvoker() { return Invoker{}; }
// polymorphic
// Type-erased factory: casts the void pointers to the op's concrete f16
// element type and heap-allocates an Argument (M01/N01 fixed to 1, as in
// MakeArgument).
std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
                    const void* p_b,
                    void* p_c,
                    index_t M,
                    index_t N,
                    index_t K,
                    index_t StrideA,
                    index_t StrideB,
                    index_t StrideC,
                    ck::tensor_operation::element_wise::PassThrough a_element_op,
                    ck::tensor_operation::element_wise::PassThrough b_element_op,
                    ck::tensor_operation::element_wise::PassThrough c_element_op) override
{
    return std::make_unique<Argument>(static_cast<const ck::half_t*>(p_a),
                                      static_cast<const ck::half_t*>(p_b),
                                      static_cast<ck::half_t*>(p_c),
                                      M,
                                      N,
                                      K,
                                      StrideA,
                                      StrideB,
                                      StrideC,
                                      1,
                                      1,
                                      a_element_op,
                                      b_element_op,
                                      c_element_op);
}
// polymorphic
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
override
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
// polymorphic
// Human-readable identifier for this instance: the struct name followed by
// its baked-in tuning parameters.
std::string GetTypeString() const override
{
    std::ostringstream oss;

    // clang-format off
    oss << "DeviceGemmDl"
        << "<"
        << 256 << ", "
        << 128 << ", "
        << 128 << ", "
        << 16  << ", "
        << 2   << ", "
        << 4   << ", "
        << 4   << ", "
        << 1
        << ">";
    // clang-format on

    return oss.str();
}
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment