Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
f945f40a
Commit
f945f40a
authored
Apr 20, 2023
by
Astha Rai
Browse files
gemm example: compiles into normal executable and .so
parent
3af15212
Changes
19
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
1553 additions
and
1774 deletions
+1553
-1774
python/ait_impl/generation/Makefile
python/ait_impl/generation/Makefile
+2
-2
python/ait_impl/generation/demo.py
python/ait_impl/generation/demo.py
+0
-19
python/ait_impl/generation/ex/Makefile
python/ait_impl/generation/ex/Makefile
+16
-0
python/ait_impl/generation/ex/ex.cpp
python/ait_impl/generation/ex/ex.cpp
+183
-0
python/ait_impl/generation/ex/gemm_ex.py
python/ait_impl/generation/ex/gemm_ex.py
+293
-0
python/ait_impl/generation/ex/normal/Makefile
python/ait_impl/generation/ex/normal/Makefile
+14
-0
python/ait_impl/generation/ex/normal/gemm_ex.py
python/ait_impl/generation/ex/normal/gemm_ex.py
+293
-0
python/ait_impl/generation/ex/normal/gemm_ex_code.py
python/ait_impl/generation/ex/normal/gemm_ex_code.py
+215
-0
python/ait_impl/generation/ex/normal/input.py
python/ait_impl/generation/ex/normal/input.py
+1
-0
python/ait_impl/generation/ex/potential/Makefile
python/ait_impl/generation/ex/potential/Makefile
+13
-0
python/ait_impl/generation/ex/shared/Makefile
python/ait_impl/generation/ex/shared/Makefile
+14
-0
python/ait_impl/generation/ex/shared/gemm_ex.py
python/ait_impl/generation/ex/shared/gemm_ex.py
+293
-0
python/ait_impl/generation/ex/shared/gemm_ex_code.py
python/ait_impl/generation/ex/shared/gemm_ex_code.py
+215
-0
python/ait_impl/generation/ex/shared/input.py
python/ait_impl/generation/ex/shared/input.py
+1
-0
python/ait_impl/generation/gemm_dev_op.py
python/ait_impl/generation/gemm_dev_op.py
+0
-665
python/ait_impl/generation/gemm_kernel.py
python/ait_impl/generation/gemm_kernel.py
+0
-175
python/ait_impl/generation/norm_ex.py
python/ait_impl/generation/norm_ex.py
+0
-282
python/ait_impl/generation/permute_ex.py
python/ait_impl/generation/permute_ex.py
+0
-43
python/ait_impl/generation/xx.cpp
python/ait_impl/generation/xx.cpp
+0
-588
No files found.
python/ait_impl/generation/Makefile
View file @
f945f40a
gemm
:
xx.o
CFLAGS
=
-I
~/
rocm
/composable_kernel/include
-I
/opt/rocm-5.1.1/hip/include
-I
~/
rocm
/composable_kernel/include/
-I
~/
rocm
/composable_kernel/include/ck/
-I
~/
rocm
/composable_kernel/include/ck/problem_transform/
-I
~/
rocm
/composable_kernel/include/ck/tensor/
-I
~/
rocm
/composable_kernel/include/ck/tensor_description/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/block/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/device/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/device/impl/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/element/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/grid/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/thread/
-I
~/
rocm
/composable_kernel/include/ck/tensor_operation/gpu/warp/
-I
~/
rocm
/composable_kernel/include/ck/host_utility
-I
/external/include/half/
-I
~/
rocm
/composable_kernel/library/include/ck/library/host/
-I
~/
rocm
/composable_kernel/library/include/ck/library/host_tensor/
-I
~/
rocm
/composable_kernel/library/include/ck/library/obselete_driver_offline/
-I
~/
rocm
/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/
-I
~/
rocm
/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/
-I
~/
rocm
/composable_kernel/library/include/ck/library/tensor_operation_instance/
-I
~/
rocm
/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/
" + "
reduce/
-I
~/
rocm
/composable_kernel/library/include/ck/library/tensor_op/
-I
~/
rocm
/composable_kernel/library/include/ck/library/utility/
-I
~/
rocm
/composable_kernel/profiler/include/
CFLAGS
=
-I
~/
workspace
/composable_kernel/include
-I
/opt/
workspace/
rocm-5.1.1/hip/include
-I
~/
workspace
/composable_kernel/include/
-I
~/
workspace
/composable_kernel/include/ck/
-I
~/
workspace
/composable_kernel/include/ck/problem_transform/
-I
~/
workspace
/composable_kernel/include/ck/tensor/
-I
~/
workspace
/composable_kernel/include/ck/tensor_description/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/block/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/device/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/device/impl/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/element/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/grid/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/thread/
-I
~/
workspace
/composable_kernel/include/ck/tensor_operation/gpu/warp/
-I
~/
workspace
/composable_kernel/include/ck/host_utility
-I
/external/include/half/
-I
~/
workspace
/composable_kernel/library/include/ck/library/host/
-I
~/
workspace
/composable_kernel/library/include/ck/library/host_tensor/
-I
~/
workspace
/composable_kernel/library/include/ck/library/obselete_driver_offline/
-I
~/
workspace
/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/
-I
~/
workspace
/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/
-I
~/
workspace
/composable_kernel/library/include/ck/library/tensor_operation_instance/
-I
~/
workspace
/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/
" + "
reduce/
-I
~/
workspace
/composable_kernel/library/include/ck/library/tensor_op/
-I
~/
workspace
/composable_kernel/library/include/ck/library/utility/
-I
~/
workspace
/composable_kernel/profiler/include/
CXXFLAGS
=
-std
=
c++17
xx.o
:
hipcc
-fPIC
-fvisibility
=
hidden
$(CXXFLAGS)
-w
/opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc
$(CFLAGS)
-L
/opt/rocm/rocrand
-lrocrand
-x
hip
-c
-o
xx.cpp
hipcc
-fPIC
-fvisibility
=
hidden
$(CXXFLAGS)
-w
/opt/rocm
-5.3.0
/amdgcn/bitcode/oclc_abi_version_400.bc
$(CFLAGS)
-L
/opt/rocm
-5.3.0
/rocrand
-lrocrand
-x
hip
-c
xx.cpp
python/ait_impl/generation/demo.py
deleted
100644 → 0
View file @
3af15212
import
jinja2
SHAPE_EVAL_TEMPLATE
=
jinja2
.
Template
(
"""
int M = *in_{{ range(rank - 1)|join(' * *in_') }};
int N = *in_{{rank - 1}};
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
static constexpr auto I4 = Number<4>{};
static constexpr auto I5 = Number<5>{};
static constexpr auto K1Number = Number<K1>{};
"""
)
output
=
SHAPE_EVAL_TEMPLATE
.
render
(
rank
=
2
);
print
(
output
)
\ No newline at end of file
python/ait_impl/generation/ex/Makefile
0 → 100644
View file @
f945f40a
CFLAGS
=
-I
~/workspace/composable_kernel/include
-I
/opt/workspace/rocm-5.1.1/hip/include
-I
~/workspace/composable_kernel/include/
-I
~/workspace/composable_kernel/include/ck/
-I
~/workspace/composable_kernel/example/01_gemm/
-I
~/workspace/composable_kernel/library/include/
-I
~/workspace/composable_kernel/library/src/utility/
-I
~/workspace/composable_kernel/include/ck/problem_transform/
-I
~/workspace/composable_kernel/include/ck/tensor/
-I
~/workspace/composable_kernel/include/ck/tensor_description/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/
-I
~/workspace/composable_kernel/include/ck/host_utility
-I
/external/include/half/
-I
~/workspace/composable_kernel/library/include/ck/library/host/
-I
~/workspace/composable_kernel/library/include/ck/library/host_tensor/
-I
~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/
-I
~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/
-I
~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/
-I
~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/
-I
~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/
" + "
reduce/
-I
~/workspace/composable_kernel/library/include/ck/library/tensor_op/
-I
~/workspace/composable_kernel/library/include/ck/library/utility/
-I
~/workspace/composable_kernel/profiler/include/
CXXFLAGS
=
-std
=
c++17
gemm
:
ex.o host_tensor.o device_memory.o
hipcc
$(CXXFLAGS)
$(CFLAGS)
ex.o host_tensor.o device_memory.o
-o
gemm
device_memory.o
:
../../../../library/src/utility/device_memory.cpp
hipcc
$(CXXFLAGS)
$(CFLAGS)
-c
../../../../library/src/utility/device_memory.cpp
host_tensor.o
:
../../../../library/src/utility/host_tensor.cpp
hipcc
$(CXXFLAGS)
$(CFLAGS)
-c
../../../../library/src/utility/host_tensor.cpp
ex.o
:
hipcc
-fPIC
-fvisibility
=
hidden
$(CXXFLAGS)
-w
/opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc
$(CFLAGS)
-L
/opt/rocm-5.3.0/rocrand
-lrocrand
-x
hip
-c
ex.cpp
\ No newline at end of file
python/ait_impl/generation/ex/ex.cpp
0 → 100644
View file @
f945f40a
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using
ADataType
=
ck
::
half_t
;
using
BDataType
=
ck
::
half_t
;
using
CDataType
=
ck
::
half_t
;
using
AccDataType
=
float
;
using
ALayout
=
Col
;
using
BLayout
=
Row
;
using
CLayout
=
Row
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CElementOp
=
PassThrough
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
using
DeviceGemmInstance
=
ck
::
tensor_operation
::
device
::
DeviceGemmDl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_layout
::
gemm
::
RowMajor
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
,
256
,
128
,
128
,
16
,
2
,
4
,
4
,
1
,
S
<
8
,
2
>
,
S
<
8
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
2
,
1
,
4
,
2
>
,
S
<
8
,
1
,
32
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
1
>
,
S
<
0
,
3
,
1
,
2
>
,
S
<
1
,
1
,
4
,
2
>
,
S
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
4
>
;
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
AElementOp
,
BElementOp
,
CElementOp
>
;
bool
run_gemm
(
const
ProblemSize
&
problem_size
,
const
ExecutionConfig
&
config
)
{
using
namespace
ck
::
literals
;
auto
&
[
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
]
=
problem_size
;
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
if
constexpr
(
std
::
is_same_v
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>
)
{
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
}
else
{
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
}
};
Tensor
<
ck
::
half_t
>
a_m_k
(
f_host_tensor_descriptor
(
M
,
K
,
StrideA
,
ck
::
tensor_layout
::
gemm
::
ColumnMajor
{}));
Tensor
<
ck
::
half_t
>
b_k_n
(
f_host_tensor_descriptor
(
K
,
N
,
StrideB
,
ck
::
tensor_layout
::
gemm
::
RowMajor
{}));
switch
(
config
.
init_method
)
{
case
0
:
break
;
case
1
:
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ck
::
half_t
>
{
-
5.
f
,
5.
f
}(
a_m_k
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
ck
::
half_t
>
{
-
5.
f
,
5.
f
}(
b_k_n
);
break
;
default:
ck
::
utils
::
FillUniformDistribution
<
ck
::
half_t
>
{
-
1.
f
,
1.
f
}(
a_m_k
);
ck
::
utils
::
FillUniformDistribution
<
ck
::
half_t
>
{
-
1.
f
,
1.
f
}(
b_k_n
);
}
Tensor
<
ck
::
half_t
>
c_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
Tensor
<
ck
::
half_t
>
c_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideC
,
CLayout
{}));
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
mDesc
<<
std
::
endl
;
DeviceMem
a_m_k_device_buf
(
sizeof
(
ck
::
half_t
)
*
a_m_k
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
b_k_n_device_buf
(
sizeof
(
ck
::
half_t
)
*
b_k_n
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
c_m_n_device_buf
(
sizeof
(
ck
::
half_t
)
*
c_m_n_device_result
.
mDesc
.
GetElementSpaceSize
());
a_m_k_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_k_n_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
auto
a_element_op
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
{};
auto
b_element_op
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
{};
auto
c_element_op
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
{};
// do GEMM
auto
gemm
=
DeviceGemmInstance
{};
auto
invoker
=
gemm
.
MakeInvoker
();
auto
argument
=
gemm
.
MakeArgument
(
static_cast
<
ck
::
half_t
*>
(
a_m_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ck
::
half_t
*>
(
b_k_n_device_buf
.
GetDeviceBuffer
()),
static_cast
<
ck
::
half_t
*>
(
c_m_n_device_buf
.
GetDeviceBuffer
()),
M
,
N
,
K
,
StrideA
,
StrideB
,
StrideC
,
a_element_op
,
b_element_op
,
c_element_op
);
if
(
!
gemm
.
IsSupportedArgument
(
argument
))
{
std
::
cerr
<<
gemm
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
return
true
;
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
std
::
size_t
flop
=
2
_uz
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ck
::
half_t
)
*
M
*
K
+
sizeof
(
ck
::
half_t
)
*
K
*
N
+
sizeof
(
ck
::
half_t
)
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
if
(
config
.
do_verification
)
{
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
a_m_k
,
b_k_n
,
c_m_n_host_result
,
a_element_op
,
b_element_op
,
c_element_op
);
ref_invoker
.
Run
(
ref_argument
);
c_m_n_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
);
}
return
true
;
}
bool
run_gemm_example
(
int
argc
,
char
*
argv
[])
{
ProblemSize
problem_size
;
ExecutionConfig
config
;
return
!
parse_cmd_args
(
argc
,
argv
,
problem_size
,
config
)
||
run_gemm
(
problem_size
,
config
);
}
int
main
(
int
argc
,
char
*
argv
[])
{
return
!
run_gemm_example
(
argc
,
argv
);
}
python/ait_impl/generation/ex/gemm_ex.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
subprocess
import
re
def
SubstituteTemplate
(
template
,
values
):
text
=
template
changed
=
True
while
changed
:
changed
=
False
for
key
,
value
in
values
.
items
():
regex
=
"
\\
$
\\
{%s
\\
}"
%
key
newtext
=
re
.
sub
(
regex
,
value
,
text
)
if
newtext
!=
text
:
changed
=
True
text
=
newtext
return
text
class
EmitGemmInstance
:
def
__init__
(
self
):
self
.
make_template
=
"""
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""
self
.
gemm_devop_template
=
"""
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
${type_a},
${type_b},
${type_c},
${type_acc},
${layout_a},
${layout_b},
${layout_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
}
Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = ${elementwise_op_a}{};
auto b_element_op = ${elementwise_op_b}{};
auto c_element_op = ${elementwise_op_c}{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""
def
emit
(
self
):
values
=
{
'type_a'
:
'ck::half_t'
,
'type_b'
:
'ck::half_t'
,
'type_c'
:
'ck::half_t'
,
'type_acc'
:
'float'
,
'layout_a'
:
'ck::tensor_layout::gemm::ColumnMajor'
,
'layout_b'
:
'ck::tensor_layout::gemm::RowMajor'
,
'layout_c'
:
'ck::tensor_layout::gemm::RowMajor'
,
'elementwise_op_a'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_b'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_c'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'Gemm_spec'
:
'ck::tensor_operation::device::GemmSpecialization::Default'
,
'block_size'
:
'256'
,
'mperblock'
:
'128'
,
'nperblock'
:
'128'
,
'k0perblock'
:
'16'
,
'k1'
:
'2'
,
'm1perthread'
:
'4'
,
'n1perthread'
:
'4'
,
'kperthread'
:
'1'
,
'm1n1_thcluster_m1xs'
:
'S<8, 2>'
,
'm1n1_thcluster_n1xs'
:
'S<8, 2>'
,
'ABT_thread_slice_lengths_K0_M0_M1_K1'
:
'S<2, 1, 4, 2>'
,
'ABT_thread_cluster_lengths_K0_M0_M1_K1'
:
'S<8, 1, 32, 1>'
,
'ABT_thread_cluster_arrange_order'
:
'S<0, 3, 1, 2>'
,
'ABT_src_access_order'
:
'S<0, 3, 1, 2>'
,
'ABT_src_vec_tensor_lengths_K0_M0_M1_K1'
:
'S<1, 1, 4, 1>'
,
'ABT_src_vec_tensor_cont_dim_order'
:
'S<0, 3, 1, 2>'
,
'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1'
:
'S<1, 1, 4, 2>'
,
'BBT_thread_slice_lengths_K0_N0_N1_K1'
:
'S<2, 1, 4, 2>'
,
'BBT_thread_cluster_lengths_K0_N0_N1_K1'
:
'S<8, 1, 32, 1>'
,
'BBT_thread_cluster_arrange_order'
:
'S<0, 3, 1, 2>'
,
'BBT_src_access_order'
:
'S<0, 3, 1, 2>'
,
'BBT_src_vec_tensor_lengths_K0_N0_N1_K1'
:
'S<1, 1, 4, 1>'
,
'BBT_src_vec_tensor_cont_dim_order'
:
'S<0, 3, 1, 2>'
,
'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1'
:
'S<1, 1, 4, 2>'
,
'CTT_src_dst_access_order'
:
'S<0, 1, 2, 3, 4, 5>'
,
'CTT_src_dst_vec_dim'
:
'5'
,
'CTT_dst_scalar_per_vector'
:
'4'
}
template
=
self
.
gemm_devop_template
cf
=
open
(
"ex.cpp"
,
'w'
)
print
(
SubstituteTemplate
(
template
,
values
))
cf
.
write
(
SubstituteTemplate
(
template
,
values
))
cf
.
close
()
m_template
=
self
.
make_template
cf
=
open
(
"Makefile"
,
'w'
)
print
(
SubstituteTemplate
(
m_template
,
values
))
cf
.
write
(
SubstituteTemplate
(
m_template
,
values
))
cf
.
close
()
PIPE
=
-
1
STDOUT
=
-
2
proc
=
subprocess
.
Popen
(
[
"make"
],
shell
=
True
,
env
=
os
.
environ
.
copy
(),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
)
out
,
err
=
proc
.
communicate
()
a
=
EmitGemmInstance
()
a
.
emit
()
python/ait_impl/generation/ex/normal/Makefile
0 → 100644
View file @
f945f40a
CFLAGS
=
-I
~/workspace/composable_kernel/include
-I
/opt/workspace/rocm-5.1.1/hip/include
-I
~/workspace/composable_kernel/include/
-I
~/workspace/composable_kernel/include/ck/
-I
~/workspace/composable_kernel/example/01_gemm/
-I
~/workspace/composable_kernel/library/include/
-I
~/workspace/composable_kernel/library/src/utility/
-I
~/workspace/composable_kernel/include/ck/problem_transform/
-I
~/workspace/composable_kernel/include/ck/tensor/
-I
~/workspace/composable_kernel/include/ck/tensor_description/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/
-I
~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/
-I
~/workspace/composable_kernel/include/ck/host_utility
-I
/external/include/half/
-I
~/workspace/composable_kernel/library/include/ck/library/host/
-I
~/workspace/composable_kernel/library/include/ck/library/host_tensor/
-I
~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/
-I
~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/
-I
~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/
-I
~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/
-I
~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/
" + "
reduce/
-I
~/workspace/composable_kernel/library/include/ck/library/tensor_op/
-I
~/workspace/composable_kernel/library/include/ck/library/utility/
-I
~/workspace/composable_kernel/profiler/include/
CXXFLAGS
=
-std
=
c++17
gemm
:
ex.o host_tensor.o device_memory.o
hipcc
$(CXXFLAGS)
$(CFLAGS)
ex.o host_tensor.o device_memory.o
-o
gemm
device_memory.o
:
../../../../../library/src/utility/device_memory.cpp
hipcc
$(CXXFLAGS)
$(CFLAGS)
-c
../../../../../library/src/utility/device_memory.cpp
host_tensor.o
:
../../../../../library/src/utility/host_tensor.cpp
hipcc
$(CXXFLAGS)
$(CFLAGS)
-c
../../../../../library/src/utility/host_tensor.cpp
ex.o
:
hipcc
-fPIC
-fvisibility
=
hidden
$(CXXFLAGS)
-w
/opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc
$(CFLAGS)
-L
/opt/rocm-5.3.0/rocrand
-lrocrand
-x
hip
-c
ex.cpp
python/ait_impl/generation/ex/normal/gemm_ex.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
subprocess
import
re
def
SubstituteTemplate
(
template
,
values
):
text
=
template
changed
=
True
while
changed
:
changed
=
False
for
key
,
value
in
values
.
items
():
regex
=
"
\\
$
\\
{%s
\\
}"
%
key
newtext
=
re
.
sub
(
regex
,
value
,
text
)
if
newtext
!=
text
:
changed
=
True
text
=
newtext
return
text
class
EmitGemmInstance
:
def
__init__
(
self
):
self
.
make_template
=
"""
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/" + "reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o:
hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""
self
.
gemm_devop_template
=
"""
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
${type_a},
${type_b},
${type_c},
${type_acc},
${layout_a},
${layout_b},
${layout_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
}
Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = ${elementwise_op_a}{};
auto b_element_op = ${elementwise_op_b}{};
auto c_element_op = ${elementwise_op_c}{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""
def
emit
(
self
):
values
=
{
'type_a'
:
'ck::half_t'
,
'type_b'
:
'ck::half_t'
,
'type_c'
:
'ck::half_t'
,
'type_acc'
:
'float'
,
'layout_a'
:
'ck::tensor_layout::gemm::ColumnMajor'
,
'layout_b'
:
'ck::tensor_layout::gemm::RowMajor'
,
'layout_c'
:
'ck::tensor_layout::gemm::RowMajor'
,
'elementwise_op_a'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_b'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_c'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'Gemm_spec'
:
'ck::tensor_operation::device::GemmSpecialization::Default'
,
'block_size'
:
'256'
,
'mperblock'
:
'128'
,
'nperblock'
:
'128'
,
'k0perblock'
:
'16'
,
'k1'
:
'2'
,
'm1perthread'
:
'4'
,
'n1perthread'
:
'4'
,
'kperthread'
:
'1'
,
'm1n1_thcluster_m1xs'
:
'S<8, 2>'
,
'm1n1_thcluster_n1xs'
:
'S<8, 2>'
,
'ABT_thread_slice_lengths_K0_M0_M1_K1'
:
'S<2, 1, 4, 2>'
,
'ABT_thread_cluster_lengths_K0_M0_M1_K1'
:
'S<8, 1, 32, 1>'
,
'ABT_thread_cluster_arrange_order'
:
'S<0, 3, 1, 2>'
,
'ABT_src_access_order'
:
'S<0, 3, 1, 2>'
,
'ABT_src_vec_tensor_lengths_K0_M0_M1_K1'
:
'S<1, 1, 4, 1>'
,
'ABT_src_vec_tensor_cont_dim_order'
:
'S<0, 3, 1, 2>'
,
'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1'
:
'S<1, 1, 4, 2>'
,
'BBT_thread_slice_lengths_K0_N0_N1_K1'
:
'S<2, 1, 4, 2>'
,
'BBT_thread_cluster_lengths_K0_N0_N1_K1'
:
'S<8, 1, 32, 1>'
,
'BBT_thread_cluster_arrange_order'
:
'S<0, 3, 1, 2>'
,
'BBT_src_access_order'
:
'S<0, 3, 1, 2>'
,
'BBT_src_vec_tensor_lengths_K0_N0_N1_K1'
:
'S<1, 1, 4, 1>'
,
'BBT_src_vec_tensor_cont_dim_order'
:
'S<0, 3, 1, 2>'
,
'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1'
:
'S<1, 1, 4, 2>'
,
'CTT_src_dst_access_order'
:
'S<0, 1, 2, 3, 4, 5>'
,
'CTT_src_dst_vec_dim'
:
'5'
,
'CTT_dst_scalar_per_vector'
:
'4'
}
template
=
self
.
gemm_devop_template
cf
=
open
(
"ex.cpp"
,
'w'
)
print
(
SubstituteTemplate
(
template
,
values
))
cf
.
write
(
SubstituteTemplate
(
template
,
values
))
cf
.
close
()
m_template
=
self
.
make_template
cf
=
open
(
"Makefile"
,
'w'
)
print
(
SubstituteTemplate
(
m_template
,
values
))
cf
.
write
(
SubstituteTemplate
(
m_template
,
values
))
cf
.
close
()
PIPE
=
-
1
STDOUT
=
-
2
proc
=
subprocess
.
Popen
(
[
"make"
],
shell
=
True
,
env
=
os
.
environ
.
copy
(),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
)
out
,
err
=
proc
.
communicate
()
# Generate the example source + Makefile and run the build.
EmitGemmInstance().emit()
python/ait_impl/generation/ex/normal/gemm_ex_code.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
re
def SubstituteTemplate(template, values):
    """Expand every ``${key}`` placeholder in *template* using *values*.

    Substitution is repeated until a fixed point is reached, so a replacement
    value may itself contain further ``${...}`` placeholders referring to
    other keys.

    Args:
        template: Template text containing ``${key}`` markers.
        values:   Mapping from placeholder name to replacement string.

    Returns:
        The template with all known placeholders expanded.
    """
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            # re.escape guards keys containing regex metacharacters, and the
            # callable replacement keeps backslashes in `value` literal — a
            # plain string replacement lets re.sub interpret them as escapes
            # (e.g. a value containing '\b' would raise re.error).
            pattern = r"\$\{%s\}" % re.escape(key)
            newtext = re.sub(pattern, lambda _m: value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text
class EmitGemmInstance:
    """Writes a fixed (non-parameterised) DeviceGemmDl example to ``xx.cpp``.

    Here ``gemm_devop_template`` has every DeviceGemmDl argument hard-coded,
    so SubstituteTemplate() finds nothing to replace; the ``values`` dict is
    kept only for interface parity with the parameterised variants.
    """

    def __init__(self):
        # Complete C++ example source (no ${...} placeholders).
        # NOTE(review): the generated code references ReferenceGemmInstance in
        # the verification branch without defining it — presumably supplied by
        # "common.hpp"; confirm before building with do_verification enabled.
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl
<ADataType,
BDataType,
CDataType,
AccDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
GemmDefault,
256,
128,
128,
16,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
4>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
"""

    def emit(self):
        """Render the example source, echo it, and write it to ``xx.cpp``."""
        # Only 'type_a' is supplied; the hard-coded template ignores it.
        values = {
            'type_a': 'ck::half_t',
        }
        # Substitute once and reuse the result (the original rendered the
        # template twice); 'with' guarantees the file handle is closed.
        source = SubstituteTemplate(self.gemm_devop_template, values)
        print(source)
        with open("xx.cpp", 'w') as cf:
            cf.write(source)
# Emit the hard-coded example source to xx.cpp.
EmitGemmInstance().emit()
python/ait_impl/generation/ex/normal/input.py
0 → 100644
View file @
f945f40a
# Take GEMM input from the user and pass it to the example template.
\ No newline at end of file
python/ait_impl/generation/ex/potential/Makefile
0 → 100644
View file @
f945f40a
# Build the standalone GEMM example against a composable_kernel checkout.
# NOTE(review): CK_PATH ends with '/' and is later used as "${CK_PATH}/include",
# which yields a double slash — harmless to the compiler, left as-is.
CC = /opt/rocm/bin/hipcc
CK_PATH = /dockerx/composable_kernel/
# -DCK_AMD_GPU_GFX90A / --offload-arch=gfx90a pin the build to gfx90a.
CFLAGS = -O3 -std=c++17 -DCK_AMD_GPU_GFX90A --offload-arch=gfx90a -I "${CK_PATH}/include" -I "${CK_PATH}/library/include" -I "${CK_PATH}/profiler/include"
OBJS = ex.o host_tensor.o device_memory.o

# Link the example binary from the generated object plus CK host utilities.
all: $(OBJS)
	$(CC) $(CFLAGS) $(OBJS) -o ex

device_memory.o: ../../library/src/utility/device_memory.cpp
	$(CC) $(CFLAGS) -c ../../library/src/utility/device_memory.cpp

host_tensor.o: ../../library/src/utility/host_tensor.cpp
	$(CC) $(CFLAGS) -c ../../library/src/utility/host_tensor.cpp
\ No newline at end of file
python/ait_impl/generation/ex/shared/Makefile
0 → 100644
View file @
f945f40a
# Build the generated GEMM example into a shared object (test.so).
# FIX: the include list previously contained a literal '" + "' (a leftover
# Python string concatenation) splitting the tensor_operation_instance path;
# it is joined here into a single valid -I .../gpu/reduce/ flag.
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17

# Link all position-independent objects into the shared library.
test.so: ex.o host_tensor.o device_memory.o
	hipcc -shared $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o test.so

device_memory.o: ../../../../../library/src/utility/device_memory.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/device_memory.cpp

host_tensor.o: ../../../../../library/src/utility/host_tensor.cpp
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) $(CFLAGS) -c ../../../../../library/src/utility/host_tensor.cpp

# ex.cpp is generated by gemm_ex.py; compiled as HIP with rocrand linked in.
ex.o:
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
python/ait_impl/generation/ex/shared/gemm_ex.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
subprocess
import
re
def SubstituteTemplate(template, values):
    """Replace ``${key}`` markers in *template* from the *values* mapping.

    Passes are repeated until no substitution applies, so a replacement
    value may itself reference other keys via ``${...}``.
    """
    result = template
    dirty = True
    while dirty:
        dirty = False
        for name, replacement in values.items():
            pattern = "\\$\\{%s\\}" % name
            updated = re.sub(pattern, replacement, result)
            if updated == result:
                continue
            result = updated
            dirty = True
    return result
class EmitGemmInstance:
    """Generates a parameterised DeviceGemmDl example and builds it.

    ``emit()`` renders ``gemm_devop_template`` (C++ example source) and
    ``make_template`` (its Makefile) with a fixed set of tuning parameters,
    writes them to ``ex.cpp``/``Makefile`` in the current directory, and
    runs ``make``.
    """

    def __init__(self):
        # Makefile used to build the generated example.
        # FIX: the original template embedded a literal '" + "' (leftover
        # Python concatenation) in the tensor_operation_instance include
        # path; it is joined here so hipcc gets one valid -I .../gpu/reduce/.
        self.make_template = """
CFLAGS=-I ~/workspace/composable_kernel/include -I /opt/workspace/rocm-5.1.1/hip/include -I ~/workspace/composable_kernel/include/ -I ~/workspace/composable_kernel/include/ck/ -I ~/workspace/composable_kernel/example/01_gemm/ -I ~/workspace/composable_kernel/library/include/ -I ~/workspace/composable_kernel/library/src/utility/ -I ~/workspace/composable_kernel/include/ck/problem_transform/ -I ~/workspace/composable_kernel/include/ck/tensor/ -I ~/workspace/composable_kernel/include/ck/tensor_description/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/block/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/device/impl/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/element/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/grid/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/thread/ -I ~/workspace/composable_kernel/include/ck/tensor_operation/gpu/warp/ -I ~/workspace/composable_kernel/include/ck/host_utility -I /external/include/half/ -I ~/workspace/composable_kernel/library/include/ck/library/host/ -I ~/workspace/composable_kernel/library/include/ck/library/host_tensor/ -I ~/workspace/composable_kernel/library/include/ck/library/obselete_driver_offline/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/cpu/ -I ~/workspace/composable_kernel/library/include/ck/library/reference_tensor_operation/gpu/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_operation_instance/gpu/reduce/ -I ~/workspace/composable_kernel/library/include/ck/library/tensor_op/ -I ~/workspace/composable_kernel/library/include/ck/library/utility/ -I ~/workspace/composable_kernel/profiler/include/
CXXFLAGS = -std=c++17
gemm: ex.o host_tensor.o device_memory.o
	hipcc $(CXXFLAGS) $(CFLAGS) ex.o host_tensor.o device_memory.o -o gemm
device_memory.o: ../../../../library/src/utility/device_memory.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/device_memory.cpp
host_tensor.o: ../../../../library/src/utility/host_tensor.cpp
	hipcc $(CXXFLAGS) $(CFLAGS) -c ../../../../library/src/utility/host_tensor.cpp
ex.o:
	hipcc -fPIC -fvisibility=hidden $(CXXFLAGS) -w /opt/rocm-5.3.0/amdgcn/bitcode/oclc_abi_version_400.bc $(CFLAGS) -L/opt/rocm-5.3.0/rocrand -lrocrand -x hip -c ex.cpp
"""
        # C++ example source with ${...} placeholders for every DeviceGemmDl
        # template argument; filled in by emit().
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl<
${type_a},
${type_b},
${type_c},
${type_acc},
${layout_a},
${layout_b},
${layout_c},
${elementwise_op_a},
${elementwise_op_b},
${elementwise_op_c},
${Gemm_spec},
${block_size},
${mperblock},
${nperblock},
${k0perblock},
${k1},
${m1perthread},
${n1perthread},
${kperthread},
${m1n1_thcluster_m1xs},
${m1n1_thcluster_n1xs},
${ABT_thread_slice_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_lengths_K0_M0_M1_K1},
${ABT_thread_cluster_arrange_order},
${ABT_src_access_order},
${ABT_src_vec_tensor_lengths_K0_M0_M1_K1},
${ABT_src_vec_tensor_cont_dim_order},
${ABT_dst_vec_tensor_lengths_K0_M0_M1_K1},
${BBT_thread_slice_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_lengths_K0_N0_N1_K1},
${BBT_thread_cluster_arrange_order},
${BBT_src_access_order},
${BBT_src_vec_tensor_lengths_K0_N0_N1_K1},
${BBT_src_vec_tensor_cont_dim_order},
${BBT_dst_vec_tensor_lengths_K0_N0_N1_K1},
${CTT_src_dst_access_order},
${CTT_src_dst_vec_dim},
${CTT_dst_scalar_per_vector}>;
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<${type_a}> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ${layout_a}{}));
Tensor<${type_b}> b_k_n(f_host_tensor_descriptor(K, N, StrideB, ${layout_b}{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<${type_a}>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<${type_b}>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<${type_a}>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<${type_b}>{-1.f, 1.f}(b_k_n);
}
Tensor<${type_c}> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<${type_c}> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(${type_a}) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(${type_b}) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(${type_c}) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = ${elementwise_op_a}{};
auto b_element_op = ${elementwise_op_b}{};
auto c_element_op = ${elementwise_op_c}{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<${type_a}*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<${type_b}*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<${type_c}*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(${type_a}) * M * K + sizeof(${type_b}) * K * N + sizeof(${type_c}) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
"""

    def emit(self):
        """Render ex.cpp and Makefile from the templates and run ``make``.

        Returns:
            (stdout, stderr) byte strings from the ``make`` invocation so
            callers can inspect the build (the original discarded them).
        """
        # DeviceGemmDl template arguments, emitted verbatim into C++ source.
        values = {
            'type_a': 'ck::half_t',
            'type_b': 'ck::half_t',
            'type_c': 'ck::half_t',
            'type_acc': 'float',
            'layout_a': 'ck::tensor_layout::gemm::ColumnMajor',
            'layout_b': 'ck::tensor_layout::gemm::RowMajor',
            'layout_c': 'ck::tensor_layout::gemm::RowMajor',
            'elementwise_op_a': 'ck::tensor_operation::element_wise::PassThrough',
            'elementwise_op_b': 'ck::tensor_operation::element_wise::PassThrough',
            'elementwise_op_c': 'ck::tensor_operation::element_wise::PassThrough',
            'Gemm_spec': 'ck::tensor_operation::device::GemmSpecialization::Default',
            'block_size': '256',
            'mperblock': '128',
            'nperblock': '128',
            'k0perblock': '16',
            'k1': '2',
            'm1perthread': '4',
            'n1perthread': '4',
            'kperthread': '1',
            'm1n1_thcluster_m1xs': 'S<8, 2>',
            'm1n1_thcluster_n1xs': 'S<8, 2>',
            'ABT_thread_slice_lengths_K0_M0_M1_K1': 'S<2, 1, 4, 2>',
            'ABT_thread_cluster_lengths_K0_M0_M1_K1': 'S<8, 1, 32, 1>',
            'ABT_thread_cluster_arrange_order': 'S<0, 3, 1, 2>',
            'ABT_src_access_order': 'S<0, 3, 1, 2>',
            'ABT_src_vec_tensor_lengths_K0_M0_M1_K1': 'S<1, 1, 4, 1>',
            'ABT_src_vec_tensor_cont_dim_order': 'S<0, 3, 1, 2>',
            'ABT_dst_vec_tensor_lengths_K0_M0_M1_K1': 'S<1, 1, 4, 2>',
            'BBT_thread_slice_lengths_K0_N0_N1_K1': 'S<2, 1, 4, 2>',
            'BBT_thread_cluster_lengths_K0_N0_N1_K1': 'S<8, 1, 32, 1>',
            'BBT_thread_cluster_arrange_order': 'S<0, 3, 1, 2>',
            'BBT_src_access_order': 'S<0, 3, 1, 2>',
            'BBT_src_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 1>',
            'BBT_src_vec_tensor_cont_dim_order': 'S<0, 3, 1, 2>',
            'BBT_dst_vec_tensor_lengths_K0_N0_N1_K1': 'S<1, 1, 4, 2>',
            'CTT_src_dst_access_order': 'S<0, 1, 2, 3, 4, 5>',
            'CTT_src_dst_vec_dim': '5',
            'CTT_dst_scalar_per_vector': '4',
        }

        # Substitute each template exactly once and reuse the result for both
        # the console echo and the file write (the original rendered twice).
        source = SubstituteTemplate(self.gemm_devop_template, values)
        print(source)
        with open("ex.cpp", 'w') as cf:
            cf.write(source)

        makefile = SubstituteTemplate(self.make_template, values)
        print(makefile)
        with open("Makefile", 'w') as cf:
            cf.write(makefile)

        # Build; output is captured rather than streamed, as before.
        proc = subprocess.Popen(
            ["make"],
            shell=True,
            env=os.environ.copy(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        out, err = proc.communicate()
        return out, err
# Generate the shared-object example source + Makefile and build them.
EmitGemmInstance().emit()
python/ait_impl/generation/ex/shared/gemm_ex_code.py
0 → 100644
View file @
f945f40a
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
re
def SubstituteTemplate(template, values):
    """Expand ``${key}`` placeholders in *template* from the *values* mapping.

    The outer loop re-runs substitution until a fixed point is reached, so a
    replacement value may itself contain further ``${...}`` placeholders.
    """
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            # Literal-match pattern for "${key}" (assumes keys contain no
            # regex metacharacters, which holds for the identifiers used here).
            regex = "\\$\\{%s\\}" % key
            newtext = re.sub(regex, value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text
class EmitGemmInstance:
    """Writes a fixed (non-parameterised) DeviceGemmDl example to ``xx.cpp``.

    ``gemm_devop_template`` has every DeviceGemmDl argument hard-coded, so
    SubstituteTemplate() finds nothing to replace; the ``values`` dict exists
    only for interface parity with the parameterised variants.
    """

    def __init__(self):
        # Complete C++ example source (no ${...} placeholders).
        # NOTE(review): the verification branch references ReferenceGemmInstance
        # without defining it — presumably provided by "common.hpp"; confirm.
        self.gemm_devop_template = """
#pragma once
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float;
using ALayout = Col;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl
<ADataType,
BDataType,
CDataType,
AccDataType,
ALayout,
BLayout,
CLayout,
AElementOp,
BElementOp,
CElementOp,
GemmDefault,
256,
128,
128,
16,
2,
4,
4,
1,
S<8, 2>,
S<8, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<2, 1, 4, 2>,
S<8, 1, 32, 1>,
S<0, 3, 1, 2>,
S<0, 3, 1, 2>,
S<1, 1, 4, 1>,
S<0, 3, 1, 2>,
S<1, 1, 4, 2>,
S<0, 1, 2, 3, 4, 5>,
5,
4>;
bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true;
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
if(config.do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return true;
}
bool run_gemm_example(int argc, char* argv[])
{
ProblemSize problem_size;
ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
"""

    def emit(self):
        """Render the example source, echo it, and write it to ``xx.cpp``."""
        # Only 'type_a' is supplied; the hard-coded template ignores it.
        values = {
            'type_a': 'ck::half_t',
        }
        template = self.gemm_devop_template
        # NOTE(review): the template is substituted twice (print + write) and
        # the handle is closed manually rather than via a 'with' block.
        cf = open("xx.cpp", 'w')
        print(SubstituteTemplate(template, values))
        cf.write(SubstituteTemplate(template, values))
        cf.close()
# Emit the hard-coded example source to xx.cpp.
EmitGemmInstance().emit()
python/ait_impl/generation/ex/shared/input.py
0 → 100644
View file @
f945f40a
# Take input for GEMM from the user and pass it to the example template.
\ No newline at end of file
python/ait_impl/generation/gemm_dev_op.py
deleted
100644 → 0
View file @
3af15212
This diff is collapsed.
Click to expand it.
python/ait_impl/generation/gemm_kernel.py
deleted
100644 → 0
View file @
3af15212
import
enum
import
os.path
import
shutil
import
functools
import
operator
import
collections
import
re
def
SubstituteTemplate
(
template
,
values
):
text
=
template
changed
=
True
while
changed
:
changed
=
False
for
key
,
value
in
values
.
items
():
regex
=
"
\\
$
\\
{%s
\\
}"
%
key
newtext
=
re
.
sub
(
regex
,
value
,
text
)
if
newtext
!=
text
:
changed
=
True
text
=
newtext
return
text
class
EmitGemmInstance
:
def
__init__
(
self
):
self
.
gemm_kernel_template
=
"""
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
typename AGridDesc_K0_M0_M1_K1,
typename BGridDesc_K0_N0_N1_K1,
typename CGridDesc_M0_M10_M11_N0_N10_N11,
typename Block2CTileMap,
bool HasMainKBlockLoop,
bool HasDoubleTailKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
kernel_gemm_dl_v1r3(const ${type_ab}* __restrict__ ${p_a_grid},
const ${type_ab}* __restrict__ ${p_b_grid},
${type_c}* __restrict__ ${p_c_grid},
const ${A_GridDesc_K0_M_K1} ${a_grid_desc_k0_m0_m1_k1},
const ${BGridDesc_K0_N_K1} ${b_grid_desc_k0_n0_n1_k1},
const ${CGridDesc_M0_M10_M11_N0_N10_N11} ${c_grid_desc_m0_m10_m11_n0_n10_n11},
const Block2CTileMap ${block_2_ctile_map})
{
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(${type_ab});
__shared__ ${type_ab} p_shared_block[shared_block_size];
GridwiseGemm::Run(${p_a_grid},
${p_b_grid},
${p_c_grid},
p_shared_block,
${a_grid_desc_k0_m0_m1_k1},
${b_grid_desc_k0_n0_n1_k1},
${c_grid_desc_m0_m10_m11_n0_n10_n11},
${block_2_ctile_map},
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
template <index_t BlockSize,
${type_ab},
${type_acc},
${type_c},
InMemoryDataOperationEnum CGlobalMemoryDataOperation,
${A_GridDesc_K0_M_K1},
${BGridDesc_K0_N_K1},
${CGridDesc_M_N},
${mperblock},
${nperblock},
${k0perblock},
${k1value},
${M1PerThreadM111},
${N1PerThreadN111},
${KPerThread},
${M11N11ThreadClusterM110Xs},
${M11N11ThreadClusterN110Xs},
${ABlockTransferThreadSliceLengths_K0_M0_M1_K1},
${ABlockTransferThreadClusterLengths_K0_M0_M1_K1},
${ABlockTransferThreadClusterArrangeOrder},
${ABlockTransferSrcAccessOrder},
${ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1},
${ABlockTransferSrcVectorTensorContiguousDimOrder},
${ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1},
${BBlockTransferThreadSliceLengths_K0_N0_N1_K1},
${BBlockTransferThreadClusterLengths_K0_N0_N1_K1},
${BBlockTransferThreadClusterArrangeOrder},
${BBlockTransferSrcAccessOrder},
${BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1},
${BBlockTransferSrcVectorTensorContiguousDimOrder},
${BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1},
${CThreadTransferSrcDstAccessOrder},
${CThreadTransferSrcDstVectorDim},
${CThreadTransferDstScalarPerVector}>
struct GridwiseGemmDl_km_kn_mn_v1r3
{
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
// K1 should be Number<...>
static constexpr auto K1 = Number<K1Value>{};
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
// TODO: change this. I think it needs multi-dimensional alignment
constexpr auto max_lds_align = K1;
// TODO: check alignment
// A matrix in LDS memory, dst of blockwise copy
constexpr auto a_block_desc_k_m = make_naive_tensor_descriptor_aligned(
make_tuple(Number<K0PerBlock>{}, Number<MPerBlock>{}, K1), max_lds_align);
// TODO: check alignment
// B matrix in LDS memory, dst of blockwise copy
constexpr auto b_block_desc_k_n = make_naive_tensor_descriptor_aligned(
make_tuple(Number<K0PerBlock>{}, Number<NPerBlock>{}, K1), max_lds_align);
// TODO: check alignment
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_aligned_space_size =
math::integer_least_multiple(a_block_desc_k_m.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_aligned_space_size =
math::integer_least_multiple(b_block_desc_k_n.GetElementSpaceSize(), max_lds_align);
return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB);
}
"""
def
emit
(
self
):
values
=
{
'function_name'
:
"gemm"
,
'type_a'
:
'ck::half_t'
,
'type_b'
:
'ck::half_t'
,
'type_c'
:
'ck::half_t'
,
'type_acc'
:
'float'
,
'layout_a'
:
'ck::tensor_layout::gemm::RowMajor'
,
'layout_b'
:
'ck::tensor_layout::gemm::RowMajor'
,
'layout_c'
:
'ck::tensor_layout::gemm::RowMajor'
,
'elementwise_op_a'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_b'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'elementwise_op_c'
:
'ck::tensor_operation::element_wise::PassThrough'
,
'Gemm_spec'
:
'ck::tensor_operation::device::GemmSpecialization::MNKPadding'
,
'block_size'
:
'256'
,
'mperblock'
:
'64'
,
'nperblock'
:
'128'
,
'kperblock'
:
'32'
,
'k1'
:
'8'
,
'mperxdl'
:
'32'
,
'nperxdl'
:
'32'
,
'mxdlperwave'
:
'1'
,
'nxdlperwave'
:
'2'
,
'threadclusterlength_a'
:
'ck::Sequence<4,64,1>'
,
'threadclusterarrange_a'
:
'ck::Sequence<1,0,2>'
,
'srcaccessorder_a'
:
'ck::Sequence<1,0,2>'
,
'srcvectordim_a'
:
'2'
,
'srcscalarpervec_a'
:
'8'
,
'dstscalarpervec_a'
:
'8'
,
'add_extra_dim_a'
:
'1'
,
'threadclusterlength_b'
:
'ck::Sequence<8,32,1>'
,
'threadclusterarrange_b'
:
'ck::Sequence<0,2,1>'
,
'srcaccessorder_b'
:
'ck::Sequence<0,2,1>'
,
'srcvectordim_b'
:
'1'
,
'srcscalarpervec_b'
:
'4'
,
'dstscalarpervec_b'
:
'2'
,
'add_extra_dim_b'
:
'0'
,
'dstscalarpervec_c'
:
'8'
}
template
=
self
.
gemm_template
print
(
SubstituteTemplate
(
template
,
values
))
\ No newline at end of file
python/ait_impl/generation/norm_ex.py
deleted
100644 → 0
View file @
3af15212
import
os
import
re
from
hashlib
import
sha1
from
typing
import
Any
,
Dict
,
OrderedDict
import
jinja2
#from ...target import Target
#templating
FUNC_CALL_PARAM_TEMPLATE
=
jinja2
.
Template
(
"(void *)({{name}})"
)
INSTANCE_TEMPLATE
=
jinja2
.
Template
(
"""
using {{name}} = {{ config_name }};
"""
)
ARGS_PARSE_TEMPLATE
=
jinja2
.
Template
(
"""
{% for idx in range(rank) %}
const int64_t in_{{idx}} = std::stoi(argv[{{ idx + 1 }}]);
{% endfor %}
"""
)
STRUCTS_DEF_TEMPLATE
=
jinja2
.
Template
(
"""
struct ProfilerMemoryPool {
ProfilerMemoryPool() {
std::random_device rd;
gen = std::mt19937(rd());
uniform_dist = std::uniform_int_distribution<int64_t>(1, 48964896);
offsets.reserve(512);
strides.reserve(512);
copies.reserve(512);
ptrs.reserve(512);
}
~ProfilerMemoryPool() {
for(int i = 0; i < ptrs.size(); i++){
hipFree(ptrs[i]);
}
}
template <typename DType>
DType* AllocateGaussianTensor(int64_t size) {
size_t length = size * sizeof(DType);
DType *d_x;
hipMalloc(&d_x, length);
float mean = 0.0f;
float stddev = 1.0f;
uint64_t seed = uniform_dist(gen);
rocrand_set_seed(generator, seed);
rocrand_generate_normal(generator, reinterpret_cast<float*>(d_x), size, mean, stddev);
return d_x;
}
ck::half_t* AllocateHalfGaussianTensor(int64_t size) {
return reinterpret_cast<ck::half_t*>(
AllocateGaussianTensor<ck::half_t>(size));
}
int AllocateHalfTensor(int64_t size, int64_t copy) {
offsets.push_back(0);
strides.push_back(size);
copies.push_back(copy);
auto ptr = AllocateHalfGaussianTensor(size * copy);
ptrs.push_back(reinterpret_cast<void*>(ptr));
return ptrs.size() - 1;
}
ck::half_t* RequestHalfTensorByIdx(int idx) {
auto copy = copies.at(idx);
auto offset = offsets.at(idx);
auto stride = strides.at(idx);
ck::half_t* ptr = reinterpret_cast<ck::half_t*>(ptrs.at(idx));
ptr += offset;
offset += stride;
if (offset == copy * stride) {
offset = 0;
}
offsets[idx] = offset;
return ptr;
}
std::vector<int64_t> offsets;
std::vector<int64_t> strides;
std::vector<int64_t> copies;
std::vector<void*> ptrs;
std::mt19937 gen;
std::uniform_int_distribution<int64_t> uniform_dist;
rocrand_generator generator;
};
// hack for DeviceMem linking error
// TODO fix this by making CK a header-only lib
// <<< hack begin
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p) const
{
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p) const
{
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
struct KernelTimerImpl
{
KernelTimerImpl() {
hipGetErrorString(hipEventCreate(&mStart));
hipGetErrorString(hipEventCreate(&mEnd));
}
~KernelTimerImpl() {
hipGetErrorString(hipEventDestroy(mStart));
hipGetErrorString(hipEventDestroy(mEnd));
}
void Start() {
hipGetErrorString(hipDeviceSynchronize());
hipGetErrorString(hipEventRecord(mStart, nullptr));
}
void End() {
hipGetErrorString(hipEventRecord(mEnd, nullptr));
hipGetErrorString(hipEventSynchronize(mEnd));
}
float GetElapsedTime() const {
float time;
hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
return time;
}
hipEvent_t mStart, mEnd;
};
// >>> hack end
"""
)
FUNC_TEMPLATE
=
jinja2
.
Template
(
"""
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <random>
#include <rocrand/rocrand.h>
#include "include/ck/utility/print.hpp"
#include "library/include/ck/library/utility/device_memory.hpp"
#include "library/include/ck/library/utility/host_tensor.hpp"
#include "library/include/ck/library/utility/host_tensor_generator.hpp"
#include "include/ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "include/ck/utility/reduction_operator.hpp"
{{extra_headers}}
{{extra_code}}
{{instances_decl}}
{{func_signature}}
{
{{shape_eval}}
{{exec_paths}}
}
"""
)
FUNC_CALL_TEMPLATE
=
jinja2
.
Template
(
"""
{{indent}}{{func_name}}(
{{indent}} {{input}},
{{indent}} {{output}},
{% for name in input_dim_names %}
{{indent}} const_cast<int64_t *>(&{{name}}),
{% endfor %}
{{indent}} stream
{{indent}});
"""
)
PROFILER_TEMPLATE
=
jinja2
.
Template
(
"""
size_t GLOBAL_WORKSPACE_SIZE = 0;
{{op_func}}
{{structs_def}}
int main(int argc, char** argv) {
{{args_parse}}
auto memory_pool = std::make_unique<ProfilerMemoryPool>();
hipStream_t stream = nullptr;
{{tensor_decl}}
// warmup
for(int i = 0; i < 3; ++i) {
{{func_call}}
}
// run
KernelTimerImpl timer;
timer.Start();
for(int i = 0; i < 5; ++i) {
{{func_call}}
}
timer.End();
std::cout << "WS:" <<GLOBAL_WORKSPACE_SIZE<<std::endl;
std::cout << "TIME:" << timer.GetElapsedTime() << std::endl;
}
"""
)
# rendering (messy, need to modularize and organize)
# def gen_profiler(
# shape_eval_template: jinja2.Template,
# exec_template: jinja2.Template,
# tensor_decl_template: jinja2.Template,
# extra_header_template: jinja2.Template,
# get_func_signature: Any,
# extra_code: str = "",
# func_call_template: jinja2.Template = FUNC_CALL_TEMPLATE,
# indent: str = " ",
# ) -> str:
# shape_eval_template: jinja2.Template
# exec_template: jinja2.Template
# tensor_decl_template: jinja2.Template
#extra_header_template: jinja2.Template
get_func_signature
:
Any
extra_code
:
str
=
""
func_call_template
:
jinja2
.
Template
=
FUNC_CALL_TEMPLATE
indent
:
str
=
" "
# shape_eval = shape_eval_template.render(rank=2) #if shape_eval_template else ""
# exe_path = exec_template.render(instance="DeviceInstance",dtype="void",reduce_dims=1,rank=2,eps=eps,)
instances
=
INSTANCE_TEMPLATE
.
render
(
name
=
"DeviceInstance"
,
config_name
=
"ck::tensor_operation::device::DeviceLayernormImpl"
,)
op_func
=
FUNC_TEMPLATE
.
render
(
instances_decl
=
instances
,
#func_signature=get_func_signature(func_attrs),
#shape_eval=shape_eval,
#exec_paths=exe_path,
#extra_headers=extra_header_template.render(),
extra_code
=
extra_code
,)
structs_def
=
STRUCTS_DEF_TEMPLATE
.
render
()
args_parse
=
ARGS_PARSE_TEMPLATE
.
render
(
rank
=
2
)
#tensor_decl = tensor_decl_template.render(rank=2)
input_dim_names
=
[
f
"in_
{
i
}
"
for
i
in
range
(
2
)]
func_call
=
func_call_template
.
render
(
func_name
=
"norm"
,
input
=
"(void *) memory_pool->RequestHalfTensorByIdx(0)"
,
gamma
=
"(void *) memory_pool->RequestHalfTensorByIdx(2)"
,
beta
=
"(void *) memory_pool->RequestHalfTensorByIdx(3)"
,
output
=
"(void *) memory_pool->RequestHalfTensorByIdx(1)"
,
input_dim_names
=
input_dim_names
,
indent
=
indent
,
)
code
=
PROFILER_TEMPLATE
.
render
(
op_func
=
op_func
,
structs_def
=
structs_def
,
args_parse
=
args_parse
,
#tensor_decl=tensor_decl,
func_call
=
func_call
,
)
# print(instances)
# print(args_parse)
# print(structs_def)
#print(func_call)
#print(op_func)
print
(
code
)
python/ait_impl/generation/permute_ex.py
deleted
100644 → 0
View file @
3af15212
import
jinja2
EXTRA_SHAPE_TEMPLATE
=
jinja2
.
Template
(
"""
{{indent}}const int64_t stride_a = *a_dim1;
{{indent}}const int64_t stride_b = *b_dim1;
{{indent}}const int64_t stride_c = *c_dim1;
ck::index_t M0 = M / G1 / G2;
ck::index_t M1 = G1;
ck::index_t M2 = G2;
ck::index_t N0 = G3;
ck::index_t N1 = N / G3;
// GEMM shape
//ck::index_t M = M0 * M1 * M2;
//ck::index_t N = N0 * N1;
//ck::index_t K = 128;
//ck::index_t stride_A = K;
//ck::index_t stride_B = K;
// E = [M0, N0, M1, N1, M2]
/* 0, 3, 1, 4, 2
ck::index_t stride_E_M0 = N0 * M1 * N1 * M2;
ck::index_t stride_E_M1 = N1 * M2;
ck::index_t stride_E_M2 = 1;
ck::index_t stride_E_N0 = M1 * N1 * M2;
ck::index_t stride_E_N1 = M2;
*/
// E = [M2, M0, N0, M1, N1] 2, 0, 3, 1, 4
ck::index_t stride_E_M0 = N0* M1* N1;
ck::index_t stride_E_M1 = N1;
ck::index_t stride_E_M2 = M0* N0* M1* N1;
ck::index_t stride_E_N0 = M1 * N1;
ck::index_t stride_E_N1 = 1;
// D = [0, N0, 0, N1, 0]
ck::index_t stride_D_M0 = 0;
ck::index_t stride_D_M1 = 0;
ck::index_t stride_D_M2 = 0;
ck::index_t stride_D_N0 = N1;
ck::index_t stride_D_N1 = 1;
"""
)
output
=
EXTRA_SHAPE_TEMPLATE
.
render
(
indent
=
" "
);
print
(
output
)
\ No newline at end of file
python/ait_impl/generation/xx.cpp
deleted
100644 → 0
View file @
3af15212
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment