Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
6ac1d6a2
Commit
6ac1d6a2
authored
Mar 11, 2024
by
illsilin
Browse files
merging from public repo
parents
e60c5aea
42fc8edd
Changes
303
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1464 additions
and
34 deletions
+1464
-34
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_bf16.cpp
...add_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_bf16.cpp
+1
-1
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp16.cpp
...add_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp16.cpp
+1
-1
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp
...add_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp
+1
-1
client_example/24_grouped_conv_activation/grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_int8.cpp
...add_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_int8.cpp
+1
-1
client_example/24_grouped_convnd_fwd_scaleadd_ab/CMakeLists.txt
..._example/24_grouped_convnd_fwd_scaleadd_ab/CMakeLists.txt
+0
-11
client_example/25_wrapper/CMakeLists.txt
client_example/25_wrapper/CMakeLists.txt
+8
-0
client_example/25_wrapper/README.md
client_example/25_wrapper/README.md
+177
-0
client_example/25_wrapper/wrapper_basic_gemm.cpp
client_example/25_wrapper/wrapper_basic_gemm.cpp
+215
-0
client_example/25_wrapper/wrapper_img2col.cpp
client_example/25_wrapper/wrapper_img2col.cpp
+23
-19
client_example/25_wrapper/wrapper_optimized_gemm.cpp
client_example/25_wrapper/wrapper_optimized_gemm.cpp
+307
-0
cmake/Embed.cmake
cmake/Embed.cmake
+238
-0
codegen/CMakeLists.txt
codegen/CMakeLists.txt
+49
-0
codegen/driver/main.cpp
codegen/driver/main.cpp
+71
-0
codegen/include/ck/host/device_gemm_multiple_d.hpp
codegen/include/ck/host/device_gemm_multiple_d.hpp
+42
-0
codegen/include/ck/host/device_gemm_multiple_d/operation.hpp
codegen/include/ck/host/device_gemm_multiple_d/operation.hpp
+42
-0
codegen/include/ck/host/device_gemm_multiple_d/problem.hpp
codegen/include/ck/host/device_gemm_multiple_d/problem.hpp
+39
-0
codegen/include/ck/host/headers.hpp
codegen/include/ck/host/headers.hpp
+18
-0
codegen/include/ck/host/operation/gemm.hpp
codegen/include/ck/host/operation/gemm.hpp
+49
-0
codegen/include/ck/host/stringutils.hpp
codegen/include/ck/host/stringutils.hpp
+104
-0
codegen/include/ck/host/types.hpp
codegen/include/ck/host/types.hpp
+78
-0
No files found.
client_example/2
3_
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_bf16.cpp
→
client_example/2
4_grouped_conv_activation/
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_bf16.cpp
View file @
6ac1d6a2
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023
-2024
, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
...
...
client_example/2
3_
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp16.cpp
→
client_example/2
4_grouped_conv_activation/
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp16.cpp
View file @
6ac1d6a2
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023
-2024
, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
...
...
client_example/2
3_
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp
→
client_example/2
4_grouped_conv_activation/
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp
View file @
6ac1d6a2
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023
-2024
, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
...
...
client_example/2
3_
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_int8.cpp
→
client_example/2
4_grouped_conv_activation/
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_int8.cpp
View file @
6ac1d6a2
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2023
-2024
, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
...
...
client_example/24_grouped_convnd_fwd_scaleadd_ab/CMakeLists.txt
deleted
100644 → 0
View file @
e60c5aea
add_executable
(
client_grouped_convnd_fwd_scaleadd_ab_fp32 grouped_conv_fwd_scaleadd_ab_fp32.cpp
)
target_link_libraries
(
client_grouped_convnd_fwd_scaleadd_ab_fp32 PRIVATE composable_kernel::device_conv_operations
)
add_executable
(
client_grouped_convnd_fwd_scaleadd_ab_fp16 grouped_conv_fwd_scaleadd_ab_fp16.cpp
)
target_link_libraries
(
client_grouped_convnd_fwd_scaleadd_ab_fp16 PRIVATE composable_kernel::device_conv_operations
)
add_executable
(
client_grouped_convnd_fwd_scaleadd_ab_bf16 grouped_conv_fwd_scaleadd_ab_bf16.cpp
)
target_link_libraries
(
client_grouped_convnd_fwd_scaleadd_ab_bf16 PRIVATE composable_kernel::device_conv_operations
)
add_executable
(
client_grouped_convnd_fwd_scaleadd_ab_int8 grouped_conv_fwd_scaleadd_ab_int8.cpp
)
target_link_libraries
(
client_grouped_convnd_fwd_scaleadd_ab_int8 PRIVATE composable_kernel::device_conv_operations
)
client_example/25_wrapper/CMakeLists.txt
View file @
6ac1d6a2
...
...
@@ -2,3 +2,11 @@ add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrap
target_link_libraries
(
client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations
)
add_executable
(
client_wrapper_img2col wrapper_img2col.cpp
)
target_link_libraries
(
client_wrapper_img2col PRIVATE composable_kernel::device_other_operations
)
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR
GPU_TARGETS MATCHES
"gfx940"
OR GPU_TARGETS MATCHES
"gfx941"
OR
GPU_TARGETS MATCHES
"gfx942"
)
add_executable
(
client_wrapper_basic_gemm wrapper_basic_gemm.cpp
)
target_link_libraries
(
client_wrapper_basic_gemm PRIVATE composable_kernel::device_other_operations
)
add_executable
(
client_wrapper_optimized_gemm wrapper_optimized_gemm.cpp
)
target_link_libraries
(
client_wrapper_optimized_gemm PRIVATE composable_kernel::device_other_operations
)
endif
()
client_example/25_wrapper/README.md
0 → 100644
View file @
6ac1d6a2
# Composable Kernel wrapper GEMM tutorial
This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK)
wrapper. We present the base version of GEMM without most of the available optimizations; however,
it's worth noting that CK has kernels with different optimizations.
To implement these optimizations, you can use the CK wrapper or directly use available instances in
CK. You can also refer to the
[
optimized GEMM example
](
https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp
)
,
that uses CK wrapper based on the
[
`gridwise_gemm_xdlops_v2r3`
](
https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
)
implementation.
The kernel definition should look similar to:
```
cpp
template
<
typename
DataType
,
typename
GemmTraits
,
ck
::
index_t
scalar_per_vector
,
typename
BlockShape
,
typename
ThreadLayout
>
__global__
void
__CK_WRAPPER_LAUNCH_BOUNDS__
DeviceGemm
(
const
void
*
p_a
,
const
void
*
p_b
,
void
*
p_c
,
const
ck
::
index_t
M
,
const
ck
::
index_t
N
,
const
ck
::
index_t
K
,
const
BlockShape
tile_shape
,
const
ThreadLayout
thread_layout
)
```
We pass pointers to global memory and matrix dimensions via arguments. Additionally, we pass
selected lengths of processed data through each block (
`tile_shape`
) and thread layout
(
`thread_layout`
). For compilation time parameters, we define the data type,
[
traits for the GEMM operation
](
https://github.com/ROCm/composable_kernel/blob/develop/include/ck/wrapper/traits/blockwise_gemm_xdl_traits.hpp
)
and scalar per vector value during copy.
Step 1: Create layouts for global and LDS memory.
```
cpp
// Specify layouts for global memory.
const
auto
a_global_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
M
,
K
),
ck
::
make_tuple
(
K
,
1
));
const
auto
b_global_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
N
,
K
),
ck
::
make_tuple
(
K
,
1
));
const
auto
c_global_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
M
,
N
),
ck
::
make_tuple
(
N
,
1
));
// Specify layouts for tiles.
constexpr
auto
a_tile_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
MPerBlock
,
KPerBlock
),
ck
::
make_tuple
(
KPerBlock
,
ck
::
Number
<
1
>
{}));
constexpr
auto
b_tile_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
NPerBlock
,
KPerBlock
),
ck
::
make_tuple
(
KPerBlock
,
ck
::
Number
<
1
>
{}));
constexpr
auto
c_tile_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
MPerBlock
,
NPerBlock
),
ck
::
make_tuple
(
NPerBlock
,
ck
::
Number
<
1
>
{}));
// Apply padding for global memory.
auto
a_global_layout_padded
=
ck
::
wrapper
::
pad
(
a_global_layout
,
shape
(
a_tile_layout
));
auto
b_global_layout_padded
=
ck
::
wrapper
::
pad
(
b_global_layout
,
shape
(
b_tile_layout
));
auto
c_global_layout_padded
=
ck
::
wrapper
::
pad
(
c_global_layout
,
shape
(
c_tile_layout
));
```
We pad layouts for global tensors in case M, N, and K are not divisible by
`MPerBlock`
,
`NPerBlock`
, or
`KPerBlock`
.
Step 2: Create tensors for global and LDS memory.
```
cpp
// Make tensors for global memory.
auto
a_global_tensor
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Global
>
(
static_cast
<
const
DataType
*>
(
p_a
),
a_global_layout_padded
);
auto
b_global_tensor
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Global
>
(
static_cast
<
const
DataType
*>
(
p_b
),
b_global_layout_padded
);
auto
c_global_tensor
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Global
>
(
static_cast
<
DataType
*>
(
p_c
),
c_global_layout_padded
);
// Allocate LDS memory.
__shared__
DataType
lds_a
[
ck
::
wrapper
::
size
(
a_tile_layout
)];
__shared__
DataType
lds_b
[
ck
::
wrapper
::
size
(
b_tile_layout
)];
// Make tensors for lds memory.
auto
a_lds_tensor
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Lds
>
(
static_cast
<
DataType
*>
(
lds_a
),
a_tile_layout
);
auto
b_lds_tensor
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Lds
>
(
static_cast
<
DataType
*>
(
lds_b
),
b_tile_layout
);
```
We must specify parameters for copy and convert block indexes to tuple:
```
cpp
// Specify block index as tuple.
const
auto
block_idxs
=
ck
::
make_tuple
(
static_cast
<
ck
::
index_t
>
(
blockIdx
.
x
),
static_cast
<
ck
::
index_t
>
(
blockIdx
.
y
),
ck
::
wrapper
::
slice
());
// Specify access parameters for copy.
using
DimAccessOrder
=
ck
::
Tuple
<
ck
::
Number
<
0
>
,
ck
::
Number
<
1
>>
;
constexpr
ck
::
index_t
vector_dim
=
1
;
```
We create a local tile (per block) and local partitions (per thread) for the global memory (
`C`
). We also
define and clear an output register (
`c_vgpr_reg`
) for the accumulation.
```
cpp
auto
c_global_local_tile
=
ck
::
wrapper
::
make_local_tile
(
c_global_tensor
,
tile_shape
,
block_idxs
,
make_tuple
(
ck
::
Number
<
1
>
{},
ck
::
Number
<
1
>
{},
ck
::
wrapper
::
slice
(
KPerBlock
)));
auto
c_global_local_partition
=
ck
::
wrapper
::
make_blockwise_gemm_xdl_c_local_partition
<
DataType
,
decltype
(
a_tile_layout
),
decltype
(
b_tile_layout
),
ck
::
wrapper
::
size
(
thread_layout
),
GemmTraits
>
(
c_global_local_tile
);
// Create C vgpr to accumulate results.
auto
c_vgpr_reg
=
ck
::
wrapper
::
make_blockwise_gemm_xdl_c_vgpr
<
DataType
,
decltype
(
a_tile_layout
),
decltype
(
b_tile_layout
),
ck
::
wrapper
::
size
(
thread_layout
),
GemmTraits
>
();
// Clear C vgpr.
ck
::
wrapper
::
clear
(
c_vgpr_reg
);
```
We use two specific functions for
`blockwise_gemm`
:
`make_blockwise_gemm_xdl_c_local_partition`
and
`make_blockwise_gemm_xdl_c_vgpr`
. This helps to choose the appropriate partition for the
`C`
output
and define tensors with specific layouts for
`blockwise_gemm`
. In the following step, we use only
generic functions for the CK wrapper.
Step 3: Create the compute loop.
```
cpp
const
ck
::
index_t
num_loop
=
ck
::
math
::
integer_divide_ceil
(
K
,
KPerBlock
);
ck
::
index_t
i
=
0
;
do
{
// Get KPerBlock slice.
const
auto
k_slice
=
ck
::
wrapper
::
slice
(
i
*
KPerBlock
,
(
i
+
1
)
*
KPerBlock
);
auto
a_global_tensor_k_slice
=
a_global_tensor
(
ck
::
wrapper
::
slice
(),
k_slice
);
auto
b_global_tensor_k_slice
=
b_global_tensor
(
ck
::
wrapper
::
slice
(),
k_slice
);
// Create local tiles for A and B.
auto
a_global_local_tile
=
ck
::
wrapper
::
make_local_tile
(
a_global_tensor_k_slice
,
tile_shape
,
block_idxs
,
make_tuple
(
ck
::
Number
<
1
>
{},
ck
::
wrapper
::
slice
(
N
),
ck
::
Number
<
1
>
{}));
auto
b_global_local_tile
=
ck
::
wrapper
::
make_local_tile
(
b_global_tensor_k_slice
,
tile_shape
,
block_idxs
,
make_tuple
(
ck
::
wrapper
::
slice
(
M
),
ck
::
Number
<
1
>
{},
ck
::
Number
<
1
>
{}));
// Copy from global to LDS.
ck
::
wrapper
::
blockwise_copy
<
DimAccessOrder
,
vector_dim
,
scalar_per_vector
>
(
a_global_local_tile
,
a_lds_tensor
,
thread_layout
);
ck
::
wrapper
::
blockwise_copy
<
DimAccessOrder
,
vector_dim
,
scalar_per_vector
>
(
b_global_local_tile
,
b_lds_tensor
,
thread_layout
);
// Synchronize lds.
ck
::
block_sync_lds
();
// Execute blockwise GEMM.
ck
::
wrapper
::
blockwise_gemm_xdl
<
DataType
,
ck
::
wrapper
::
size
(
thread_layout
),
GemmTraits
>
(
a_lds_tensor
,
b_lds_tensor
,
c_vgpr_reg
);
++
i
;
}
while
(
i
<
num_loop
);
```
Loop iterate over
`K / KPerBlock`
. Each time a local tile is created for A and B tensors (tensor per block),
data is copied from global memory to LDS. The
`blockwise_gemm`
function performs the GEMM
operation on
`a_lds_tensor`
and
`b_lds_tensor`
, and stores results in
`c_vgpr_reg`
.
The end result from
`c_vgpr_reg`
is stored in the
`C`
local partition (tensor per thread):
```
cpp
ck
::
wrapper
::
copy
(
c_vgpr_reg
,
c_global_local_partition
);
```
If you want to dive deep into the details, you can find the entire example
[
here
](
https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_basic_gemm.cpp
)
.
test/wrapper/test
_gemm.cpp
→
client_example/25_wrapper/wrapper_basic
_gemm.cpp
View file @
6ac1d6a2
...
...
@@ -6,13 +6,9 @@
#include <iostream>
#include <initializer_list>
#include <vector>
#include <gtest/gtest.h>
#include "ck/library/utility/host_tensor.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
...
...
@@ -23,94 +19,88 @@
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/operations/gemm.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp"
template
<
typename
DataType
>
void
CheckResult
(
const
std
::
vector
<
DataType
>&
a_data
,
const
std
::
vector
<
DataType
>&
b_data
,
std
::
vector
<
DataType
>&
c_m_n_device_result
,
const
ck
::
index_t
M
,
const
ck
::
index_t
N
,
const
ck
::
index_t
K
)
struct
SimpleDeviceMem
{
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
DataType
,
DataType
,
DataType
,
float
,
PassThrough
,
PassThrough
,
PassThrough
>
;
SimpleDeviceMem
()
=
delete
;
Tensor
<
DataType
>
a_m_k
(
HostTensorDescriptor
({
M
,
K
}));
Tensor
<
DataType
>
b_k_n
(
HostTensorDescriptor
({
K
,
N
},
{
1
,
K
}));
Tensor
<
DataType
>
c_m_n_host_result
(
HostTensorDescriptor
({
M
,
N
}));
SimpleDeviceMem
(
std
::
size_t
mem_size
)
:
p_mem_
{}
{
(
void
)
hipMalloc
(
static_cast
<
void
**>
(
&
p_mem_
),
mem_size
);
}
a_m_k
.
mData
=
a_data
;
b_k_n
.
mData
=
b_data
;
void
*
GetDeviceBuffer
()
{
return
p_mem_
;
}
auto
ref_op
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_op
.
MakeInvoker
();
auto
ref_argument
=
ref_op
.
MakeArgument
(
a_m_k
,
b_k_n
,
c_m_n_host_result
,
PassThrough
{},
PassThrough
{},
PassThrough
{});
~
SimpleDeviceMem
()
{
(
void
)
hipFree
(
p_mem_
);
}
ref_invoker
.
Run
(
ref_argument
);
EXPECT_TRUE
(
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
.
mData
));
}
void
*
p_mem_
;
};
template
<
typename
DataType
,
typename
GemmTraits
,
ck
::
index_t
scalar_per_vector
,
typename
BlockShape
,
typename
ThreadLayout
Shape
>
__global__
void
DeviceGemm
(
const
void
*
p_a
,
typename
ThreadLayout
>
__global__
void
__CK_WRAPPER_LAUNCH_BOUNDS__
DeviceGemm
(
const
void
*
p_a
,
const
void
*
p_b
,
void
*
p_c
,
const
ck
::
index_t
M
,
const
ck
::
index_t
N
,
const
ck
::
index_t
K
,
const
BlockShape
tile_shape
,
const
ThreadLayout
Shape
thread_layout
)
const
ThreadLayout
thread_layout
)
{
constexpr
auto
MPerBlock
=
ck
::
wrapper
::
size
<
0
>
(
tile_shape
);
constexpr
auto
NPerBlock
=
ck
::
wrapper
::
size
<
1
>
(
tile_shape
);
constexpr
auto
KPerBlock
=
ck
::
wrapper
::
size
<
2
>
(
tile_shape
);
// Specify layouts for global memory.
const
auto
a_global_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
M
,
K
),
ck
::
make_tuple
(
K
,
1
));
const
auto
b_global_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
N
,
K
),
ck
::
make_tuple
(
K
,
1
));
const
auto
c_global_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
M
,
N
),
ck
::
make_tuple
(
N
,
1
));
// Specify layouts for tiles.
constexpr
auto
a_tile_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
MPerBlock
,
KPerBlock
),
ck
::
make_tuple
(
KPerBlock
,
ck
::
Number
<
1
>
{}));
constexpr
auto
b_tile_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
NPerBlock
,
KPerBlock
),
ck
::
make_tuple
(
KPerBlock
,
ck
::
Number
<
1
>
{}));
constexpr
auto
c_tile_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
MPerBlock
,
NPerBlock
),
ck
::
make_tuple
(
NPerBlock
,
ck
::
Number
<
1
>
{}));
// Apply padding for global memory.
auto
a_global_layout_padded
=
ck
::
wrapper
::
pad
(
a_global_layout
,
shape
(
a_tile_layout
));
auto
b_global_layout_padded
=
ck
::
wrapper
::
pad
(
b_global_layout
,
shape
(
b_tile_layout
));
auto
c_global_layout_padded
=
ck
::
wrapper
::
pad
(
c_global_layout
,
shape
(
c_tile_layout
));
// Make tensors for global memory.
auto
a_global_tensor
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Global
>
(
static_cast
<
const
DataType
*>
(
p_a
),
a_global_layout
);
static_cast
<
const
DataType
*>
(
p_a
),
a_global_layout
_padded
);
auto
b_global_tensor
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Global
>
(
static_cast
<
const
DataType
*>
(
p_b
),
b_global_layout
);
static_cast
<
const
DataType
*>
(
p_b
),
b_global_layout
_padded
);
auto
c_global_tensor
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Global
>
(
static_cast
<
DataType
*>
(
p_c
),
c_global_layout
);
auto
a_padded_global_tensor
=
ck
::
wrapper
::
pad
(
a_global_tensor
,
shape
(
a_tile_layout
));
auto
b_padded_global_tensor
=
ck
::
wrapper
::
pad
(
b_global_tensor
,
shape
(
b_tile_layout
));
auto
c_padded_global_tensor
=
ck
::
wrapper
::
pad
(
c_global_tensor
,
shape
(
c_tile_layout
));
static_cast
<
DataType
*>
(
p_c
),
c_global_layout_padded
);
// Allocate lds memory.
__shared__
DataType
lds_a
[
ck
::
wrapper
::
size
(
a_tile_layout
)];
__shared__
DataType
lds_b
[
ck
::
wrapper
::
size
(
b_tile_layout
)];
// Make tensors for lds memory.
auto
a_lds_tensor
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Lds
>
(
static_cast
<
DataType
*>
(
lds_a
),
a_tile_layout
);
auto
b_lds_tensor
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Lds
>
(
static_cast
<
DataType
*>
(
lds_b
),
b_tile_layout
);
const
ck
::
index_t
block_idx
=
static_cast
<
ck
::
index_t
>
(
blockIdx
.
x
);
// Specify block index as tuple.
const
auto
block_idxs
=
ck
::
make_tuple
(
static_cast
<
ck
::
index_t
>
(
blockIdx
.
x
),
static_cast
<
ck
::
index_t
>
(
blockIdx
.
y
),
ck
::
wrapper
::
slice
());
// Specify access parameters for copy.
using
DimAccessOrder
=
ck
::
Tuple
<
ck
::
Number
<
0
>
,
ck
::
Number
<
1
>>
;
constexpr
ck
::
index_t
vector_dim
=
1
;
// Create tile and partition for C. Use specific function for blockwise_gemm to assign the
// appropriate partitions.
auto
c_global_local_tile
=
ck
::
wrapper
::
make_local_tile
(
c_
padded_
global_tensor
,
c_global_tensor
,
tile_shape
,
block_idx
,
block_idx
s
,
make_tuple
(
ck
::
Number
<
1
>
{},
ck
::
Number
<
1
>
{},
ck
::
wrapper
::
slice
(
KPerBlock
)));
auto
c_global_local_partition
=
ck
::
wrapper
::
make_blockwise_gemm_xdl_c_local_partition
<
DataType
,
...
...
@@ -118,42 +108,49 @@ __global__ void DeviceGemm(const void* p_a,
decltype
(
b_tile_layout
),
ck
::
wrapper
::
size
(
thread_layout
),
GemmTraits
>
(
c_global_local_tile
);
// Create C vgpr to accumulate results.
auto
c_vgpr_reg
=
ck
::
wrapper
::
make_blockwise_gemm_xdl_c_vgpr
<
DataType
,
decltype
(
a_tile_layout
),
decltype
(
b_tile_layout
),
ck
::
wrapper
::
size
(
thread_layout
),
GemmTraits
>
();
// Clear C vgpr.
ck
::
wrapper
::
clear
(
c_vgpr_reg
);
// Iterate over K with KPerBlock step.
const
ck
::
index_t
num_loop
=
ck
::
math
::
integer_divide_ceil
(
K
,
KPerBlock
);
ck
::
index_t
i
=
0
;
do
{
// Get KPerBlock slice.
const
auto
k_slice
=
ck
::
wrapper
::
slice
(
i
*
KPerBlock
,
(
i
+
1
)
*
KPerBlock
);
auto
a_padded_global_tensor_k_slice
=
a_padded_global_tensor
(
ck
::
wrapper
::
slice
(),
k_slice
);
auto
b_padded_global_tensor_k_slice
=
b_padded_global_tensor
(
ck
::
wrapper
::
slice
(),
k_slice
);
auto
a_global_tensor_k_slice
=
a_global_tensor
(
ck
::
wrapper
::
slice
(),
k_slice
);
auto
b_global_tensor_k_slice
=
b_global_tensor
(
ck
::
wrapper
::
slice
(),
k_slice
);
// Create local tiles for A and B.
auto
a_global_local_tile
=
ck
::
wrapper
::
make_local_tile
(
a_
padded_
global_tensor_k_slice
,
a_global_tensor_k_slice
,
tile_shape
,
block_idx
,
block_idx
s
,
make_tuple
(
ck
::
Number
<
1
>
{},
ck
::
wrapper
::
slice
(
N
),
ck
::
Number
<
1
>
{}));
auto
b_global_local_tile
=
ck
::
wrapper
::
make_local_tile
(
b_
padded_
global_tensor_k_slice
,
b_global_tensor_k_slice
,
tile_shape
,
block_idx
,
block_idx
s
,
make_tuple
(
ck
::
wrapper
::
slice
(
M
),
ck
::
Number
<
1
>
{},
ck
::
Number
<
1
>
{}));
// Copy from global to lds.
ck
::
wrapper
::
blockwise_copy
<
DimAccessOrder
,
vector_dim
,
scalar_per_vector
>
(
a_global_local_tile
,
a_lds_tensor
,
thread_layout
);
ck
::
wrapper
::
blockwise_copy
<
DimAccessOrder
,
vector_dim
,
scalar_per_vector
>
(
b_global_local_tile
,
b_lds_tensor
,
thread_layout
);
// Synchronize lds.
ck
::
block_sync_lds
();
// Execute blockwise gemm.
ck
::
wrapper
::
blockwise_gemm_xdl
<
DataType
,
ck
::
wrapper
::
size
(
thread_layout
),
GemmTraits
>
(
a_lds_tensor
,
b_lds_tensor
,
c_vgpr_reg
);
++
i
;
}
while
(
i
<
num_loop
);
// Copy vgpr results to C global memory.
ck
::
wrapper
::
copy
(
c_vgpr_reg
,
c_global_local_partition
);
}
...
...
@@ -161,36 +158,28 @@ template <typename DataType,
typename
GemmTraits
,
ck
::
index_t
scalar_per_vector
,
typename
BlockShape
,
typename
ThreadLayout
Shape
>
typename
ThreadLayout
>
void
PerformGemm
(
const
ck
::
index_t
M
,
const
ck
::
index_t
N
,
const
ck
::
index_t
K
,
const
BlockShape
&
tile_shape
,
const
ThreadLayout
Shape
&
thread_layout
)
const
ThreadLayout
&
thread_layout
)
{
// Global memory buffers
DeviceMem
a_mem
(
M
*
K
*
sizeof
(
DataType
));
DeviceMem
b_mem
(
K
*
N
*
sizeof
(
DataType
));
DeviceMem
c_mem
(
M
*
N
*
sizeof
(
DataType
));
std
::
vector
<
DataType
>
a_data
(
M
*
K
);
std
::
vector
<
DataType
>
b_data
(
K
*
N
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
DataType
>
{
-
5.
f
,
5.
f
}(
a_data
);
ck
::
utils
::
FillUniformDistributionIntegerValue
<
DataType
>
{
-
5.
f
,
5.
f
}(
b_data
);
SimpleDeviceMem
a_mem
(
M
*
K
*
sizeof
(
DataType
));
SimpleDeviceMem
b_mem
(
K
*
N
*
sizeof
(
DataType
));
SimpleDeviceMem
c_mem
(
M
*
N
*
sizeof
(
DataType
));
a_mem
.
ToDevice
(
a_data
.
data
());
b_mem
.
ToDevice
(
b_data
.
data
());
c_mem
.
SetZero
();
const
ck
::
index_t
grid_size
=
ck
::
math
::
integer_divide_ceil
(
M
,
ck
::
wrapper
::
size
<
0
>
(
tile_shape
))
*
const
ck
::
index_t
grid_size_x
=
ck
::
math
::
integer_divide_ceil
(
M
,
ck
::
wrapper
::
size
<
0
>
(
tile_shape
));
const
ck
::
index_t
grid_size_y
=
ck
::
math
::
integer_divide_ceil
(
N
,
ck
::
wrapper
::
size
<
1
>
(
tile_shape
));
const
auto
kernel
=
DeviceGemm
<
DataType
,
GemmTraits
,
scalar_per_vector
,
BlockShape
,
ThreadLayout
Shape
>
;
launch_and_time_kernel
(
StreamConfig
{
nullptr
},
DeviceGemm
<
DataType
,
GemmTraits
,
scalar_per_vector
,
BlockShape
,
ThreadLayout
>
;
const
float
avg_time
=
launch_and_time_kernel
(
StreamConfig
{
nullptr
,
true
},
kernel
,
dim3
(
grid_size
),
dim3
(
grid_size_x
,
grid_size_y
,
1
),
dim3
(
ck
::
wrapper
::
size
(
thread_layout
)),
0
,
a_mem
.
GetDeviceBuffer
(),
...
...
@@ -202,56 +191,25 @@ void PerformGemm(const ck::index_t M,
tile_shape
,
thread_layout
);
std
::
vector
<
DataType
>
c_data
(
M
*
N
);
c_mem
.
FromDevice
(
c_data
.
data
());
CheckResult
<
DataType
>
(
a_data
,
b_data
,
c_data
,
M
,
N
,
K
);
}
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
DataType
)
*
M
*
K
+
sizeof
(
DataType
)
*
K
*
N
+
sizeof
(
DataType
)
*
M
*
N
;
TEST
(
TestGemm
,
Float
)
{
using
DataType
=
float
;
const
auto
thread_layout
=
ck
::
make_tuple
(
ck
::
Number
<
16
>
{},
ck
::
Number
<
16
>
{});
const
auto
tile_shape
=
ck
::
make_tuple
(
ck
::
Number
<
128
>
{},
ck
::
Number
<
128
>
{},
ck
::
Number
<
64
>
{});
PerformGemm
<
DataType
,
ck
::
wrapper
::
BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_4K1
,
4
>
(
512
,
512
,
128
,
tile_shape
,
thread_layout
);
// Irregular case
PerformGemm
<
DataType
,
ck
::
wrapper
::
BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_4K1
,
1
>
(
129
,
129
,
67
,
tile_shape
,
thread_layout
);
}
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
avg_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
TEST
(
TestGemm
,
Int8
)
{
using
DataType
=
int8_t
;
const
auto
thread_layout
=
ck
::
make_tuple
(
ck
::
Number
<
64
>
{},
ck
::
Number
<
4
>
{});
const
auto
tile_shape
=
ck
::
make_tuple
(
ck
::
Number
<
128
>
{},
ck
::
Number
<
128
>
{},
ck
::
Number
<
64
>
{});
PerformGemm
<
DataType
,
ck
::
wrapper
::
BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_16K1
,
16
>
(
512
,
512
,
128
,
tile_shape
,
thread_layout
);
// Irregular case
PerformGemm
<
DataType
,
ck
::
wrapper
::
BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_16K1
,
1
>
(
129
,
129
,
67
,
tile_shape
,
thread_layout
);
std
::
cout
<<
"Perf: "
<<
std
::
setw
(
10
)
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
std
::
endl
;
}
TEST
(
TestGemm
,
Half
)
int
main
(
int
argc
,
char
*
argv
[]
)
{
using
DataType
=
ck
::
half_t
;
const
auto
thread_layout
=
ck
::
make_tuple
(
ck
::
Number
<
32
>
{},
ck
::
Number
<
8
>
{});
const
auto
tile_shape
=
ck
::
make_tuple
(
ck
::
Number
<
128
>
{},
ck
::
Number
<
128
>
{},
ck
::
Number
<
64
>
{});
PerformGemm
<
DataType
,
ck
::
wrapper
::
BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_8K1
,
8
>
(
512
,
512
,
128
,
tile_shape
,
thread_layout
);
// Irregular case
PerformGemm
<
DataType
,
ck
::
wrapper
::
BlockwisGemmXdlTraits_32x32Xdl_2x2XdlPerWave_8K1
,
1
>
(
129
,
129
,
67
,
tile_shape
,
thread_layout
);
}
TEST
(
TestGemm
,
Float_2x4_4x2_XdlPerWave
)
{
using
DataType
=
float
;
const
auto
thread_layout_4x2_xdl_per_wave
=
ck
::
make_tuple
(
ck
::
Number
<
16
>
{},
ck
::
Number
<
8
>
{});
const
auto
thread_layout_2x4_xdl_per_wave
=
ck
::
make_tuple
(
ck
::
Number
<
8
>
{},
ck
::
Number
<
16
>
{});
const
auto
tile_shape
=
ck
::
make_tuple
(
ck
::
Number
<
128
>
{},
ck
::
Number
<
128
>
{},
ck
::
Number
<
64
>
{});
PerformGemm
<
DataType
,
ck
::
wrapper
::
BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_4K1
,
4
>
(
512
,
512
,
128
,
tile_shape
,
thread_layout_4x2_xdl_per_wave
);
PerformGemm
<
DataType
,
ck
::
wrapper
::
BlockwisGemmXdlTraits_32x32Xdl_2x4XdlPerWave_4K1
,
4
>
(
512
,
512
,
128
,
tile_shape
,
thread_layout_2x4_xdl_per_wave
);
const
auto
thread_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
ck
::
Number
<
64
>
{},
ck
::
Number
<
4
>
{}),
ck
::
make_tuple
(
ck
::
Number
<
4
>
{},
ck
::
Number
<
1
>
{}));
const
auto
tile_shape
=
ck
::
make_tuple
(
ck
::
Number
<
256
>
{},
ck
::
Number
<
128
>
{},
ck
::
Number
<
32
>
{});
PerformGemm
<
DataType
,
ck
::
wrapper
::
BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_8K1
,
8
>
(
3840
,
4096
,
4096
,
tile_shape
,
thread_layout
);
return
0
;
}
client_example/25_wrapper/wrapper_img2col.cpp
View file @
6ac1d6a2
...
...
@@ -15,6 +15,7 @@
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp"
static
constexpr
ck
::
index_t
NumDimSpatial
=
3
;
using
DataType
=
float
;
...
...
@@ -36,21 +37,20 @@ struct SimpleDeviceMem
void
*
p_mem_
;
};
// Test copy from Global to Global through LDS and VGPR
template
<
typename
InputTensor
,
typename
OutputTensor
,
typename
BlockShape
,
typename
ThreadLayoutShape
>
__global__
void
DeviceImageToColumnPad0
(
InputTensor
input_tensor
,
template
<
typename
InputTensor
,
typename
OutputTensor
,
typename
BlockShape
,
typename
ThreadLayout
>
__global__
void
__CK_WRAPPER_LAUNCH_BOUNDS__
DeviceImageToColumnPad0
(
InputTensor
input_tensor
,
OutputTensor
output_tensor
,
const
BlockShape
tile_shape
,
const
ThreadLayout
Shape
thread_layout
)
const
ThreadLayout
thread_layout
)
{
const
ck
::
index_t
block_idx
=
static_cast
<
ck
::
index_t
>
(
blockIdx
.
x
);
// grid layout (dim1, dim0)
const
auto
block_idxs
=
ck
::
make_tuple
(
static_cast
<
ck
::
index_t
>
(
blockIdx
.
y
),
static_cast
<
ck
::
index_t
>
(
blockIdx
.
x
));
// Get local tiles for global memory
auto
input_local_tile
=
ck
::
wrapper
::
make_local_tile
(
input_tensor
,
tile_shape
,
block_idx
);
auto
output_local_tile
=
ck
::
wrapper
::
make_local_tile
(
output_tensor
,
tile_shape
,
block_idx
);
auto
input_local_tile
=
ck
::
wrapper
::
make_local_tile
(
input_tensor
,
tile_shape
,
block_idx
s
);
auto
output_local_tile
=
ck
::
wrapper
::
make_local_tile
(
output_tensor
,
tile_shape
,
block_idx
s
);
// Get partition per thread
const
auto
input_local_partition
=
...
...
@@ -112,9 +112,11 @@ void PerformImageToColumnPad0(const ck::index_t G,
SimpleDeviceMem
out_buf
(
ck
::
wrapper
::
size
(
out_layout
)
*
sizeof
(
DataType
));
// User can choose appropriate number of threads and sizes per block
const
auto
thread_layout
=
ck
::
make_tuple
(
ck
::
Number
<
8
>
{},
ck
::
Number
<
16
>
{});
const
auto
thread_layout
=
ck
::
wrapper
::
make_layout
(
ck
::
make_tuple
(
ck
::
Number
<
8
>
{},
ck
::
Number
<
16
>
{}),
ck
::
make_tuple
(
ck
::
Number
<
16
>
{},
ck
::
Number
<
1
>
{}));
// This example doesn't support padding, user should select tile sizes
// which
divides the shape completely
// which
are divisible by the shape.
const
auto
tile_shape
=
ck
::
make_tuple
(
ck
::
Number
<
32
>
{},
ck
::
Number
<
64
>
{});
// Create buffers for global memory
...
...
@@ -123,10 +125,11 @@ void PerformImageToColumnPad0(const ck::index_t G,
auto
output_tensor_global
=
ck
::
wrapper
::
make_tensor
<
ck
::
wrapper
::
MemoryTypeEnum
::
Global
>
(
static_cast
<
DataType
*>
(
out_buf
.
GetDeviceBuffer
()),
out_layout
);
const
ck
::
index_t
grid_size
=
ck
::
math
::
integer_divide_ceil
(
ck
::
wrapper
::
size
<
0
>
(
in_layout
),
ck
::
wrapper
::
size
<
0
>
(
tile_shape
))
*
ck
::
math
::
integer_divide_ceil
(
ck
::
wrapper
::
size
<
1
>
(
in_layout
),
// grid layout (dim1, dim0)
const
ck
::
index_t
grid_size_x
=
ck
::
math
::
integer_divide_ceil
(
ck
::
wrapper
::
size
<
1
>
(
in_layout
),
ck
::
wrapper
::
size
<
1
>
(
tile_shape
));
const
ck
::
index_t
grid_size_y
=
ck
::
math
::
integer_divide_ceil
(
ck
::
wrapper
::
size
<
0
>
(
in_layout
),
ck
::
wrapper
::
size
<
0
>
(
tile_shape
));
const
auto
kernel
=
DeviceImageToColumnPad0
<
decltype
(
input_tensor_global
),
decltype
(
output_tensor_global
),
...
...
@@ -134,7 +137,7 @@ void PerformImageToColumnPad0(const ck::index_t G,
decltype
(
thread_layout
)
>
;
const
float
avg_time
=
launch_and_time_kernel
(
StreamConfig
{
nullptr
,
true
},
kernel
,
dim3
(
grid_size
),
dim3
(
grid_size
_x
,
grid_size_y
,
1
),
dim3
(
ck
::
wrapper
::
size
(
thread_layout
)),
0
,
input_tensor_global
,
...
...
@@ -178,3 +181,4 @@ int main(int argc, char* argv[])
{
1
,
1
,
1
}
/*filter_dilations*/
);
return
0
;
}
// MI100 Perf: 0.255178 ms, 1698.9 GB/s,
client_example/25_wrapper/wrapper_optimized_gemm.cpp
0 → 100644
View file @
6ac1d6a2
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <vector>
#include "ck/library/utility/host_tensor.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/operations/gemm.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp"
// RAII owner of a raw device (GPU) allocation made with hipMalloc.
// The buffer is released by hipFree in the destructor.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    // Allocates `mem_size` bytes of device memory. The hipMalloc result is
    // deliberately ignored (best-effort example code); p_mem_ stays null on
    // failure.
    SimpleDeviceMem(std::size_t mem_size) : p_mem_{} { (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size); }

    // Owning type with a freeing destructor: copying would double-free the
    // device buffer, so copy construction/assignment are deleted (Rule of
    // Five). Moves are implicitly suppressed by these deletions as well.
    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    // Returns the raw device pointer (non-owning view).
    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};
// Conditionally pads a wrapper layout.
//
// When DoPad is true, returns ck::wrapper::pad(layout, padding_dims) — i.e.
// the layout extended so each dimension is padded per `padding_dims`
// (typically up to a tile-size multiple); when DoPad is false, returns the
// layout unchanged. `if constexpr` is required because the two branches
// return different types.
template <bool DoPad, typename Layout, typename PaddingDims>
__device__ auto ApplyPadding(const Layout& layout, const PaddingDims& padding_dims)
{
    if constexpr(DoPad)
    {
        return ck::wrapper::pad(layout, padding_dims);
    }
    else
    {
        return layout;
    }
}
// GEMM kernel built from CK "wrapper" primitives.
//
// Data flow per K-step: global memory -> per-thread VGPR staging tensor ->
// LDS tile -> blockwise_gemm_xdl accumulating into a VGPR C tile, which is
// copied to global memory at the end.
//
// A is laid out as (M, K) with strides (K, 1); B as (N, K) with strides
// (K, 1); C as (M, N) with strides (N, 1). A and B are reshaped (unmerged)
// from (M-or-N, K) to (K0, M-or-N, K1) where K = K0 * K1.
//
// Template parameters:
//   DataType          - element type of A, B and C.
//   GemmTraits        - CK blockwise-GEMM traits; provides K1.
//   scalar_per_vector - vector width used by ck::wrapper::copy.
//   BlockShape        - type of tile_shape: (MPerBlock, NPerBlock, KPerBlock).
//   ThreadLayout      - layout of threads within a block.
//   DoPadding         - when true, global layouts are padded to tile multiples.
template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
          typename BlockShape,
          typename ThreadLayout,
          bool DoPadding>
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
                                                        const void* p_b,
                                                        void* p_c,
                                                        const ck::index_t M,
                                                        const ck::index_t N,
                                                        const ck::index_t K,
                                                        const BlockShape tile_shape,
                                                        const ThreadLayout thread_layout)
{
    // Per-block tile extents (compile-time) and derived K decomposition.
    constexpr auto MPerBlock  = ck::wrapper::size<0>(tile_shape);
    constexpr auto NPerBlock  = ck::wrapper::size<1>(tile_shape);
    constexpr auto KPerBlock  = ck::wrapper::size<2>(tile_shape);
    constexpr auto K1         = GemmTraits::K1;
    constexpr auto K0PerBlock = KPerBlock / K1;
    const auto K0             = ck::math::integer_divide_ceil(K, K1);
    // Tile shape in the reshaped (K0, M, N, K1) coordinate space.
    const auto tile_shape_k0_m_n_k1 = ck::make_tuple(K0PerBlock, MPerBlock, NPerBlock, K1);

    // Create layouts for global memory (row-major; see function header).
    const auto a_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
    const auto b_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
    const auto c_global_layout =
        ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));

    // Apply padding (no-op when DoPadding is false).
    auto a_padded_global_layout =
        ApplyPadding<DoPadding>(a_global_layout, ck::make_tuple(MPerBlock, KPerBlock));
    auto b_padded_global_layout =
        ApplyPadding<DoPadding>(b_global_layout, ck::make_tuple(NPerBlock, KPerBlock));
    auto c_padded_global_layout =
        ApplyPadding<DoPadding>(c_global_layout, ck::make_tuple(MPerBlock, NPerBlock));

    // Reshape from (M, K) to (K0, M, K1): dim 1 (K) is unmerged into (K0, K1),
    // and reshaped_dims_idxs places the results at indices (1, (0, 2)).
    const auto reshaped_dims_idxs =
        ck::make_tuple(ck::Number<1>{}, ck::make_tuple(ck::Number<0>{}, ck::Number<2>{}));
    auto a_padded_unmerged_global_layout =
        ck::wrapper::unmerge<1>(a_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);
    auto b_padded_unmerged_global_layout =
        ck::wrapper::unmerge<1>(b_padded_global_layout, ck::make_tuple(K0, K1), reshaped_dims_idxs);

    // Create tensors for global memory.
    auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const DataType*>(p_a), a_padded_unmerged_global_layout);
    auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<const DataType*>(p_b), b_padded_unmerged_global_layout);
    auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
        static_cast<DataType*>(p_c), c_padded_global_layout);

    // Create layouts and tensors for lds memory. The row stride is
    // (MPerBlock + 1) * K1 rather than MPerBlock * K1 — one element of
    // per-row padding (presumably to avoid LDS bank conflicts — confirm).
    constexpr auto a_tile_layout = ck::wrapper::make_layout(
        ck::make_tuple(K0PerBlock, MPerBlock, K1),
        ck::make_tuple((MPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));
    constexpr auto b_tile_layout = ck::wrapper::make_layout(
        ck::make_tuple(K0PerBlock, NPerBlock, K1),
        ck::make_tuple((NPerBlock + ck::Number<1>{}) * K1, K1, ck::Number<1>{}));
    // LDS backing storage; oversized by K0PerBlock elements to cover the
    // padded rows of the layouts above.
    __shared__ DataType lds_a[ck::wrapper::size(a_tile_layout) + K0PerBlock];
    __shared__ DataType lds_b[ck::wrapper::size(b_tile_layout) + K0PerBlock];
    auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        static_cast<DataType*>(lds_a), a_tile_layout);
    auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
        static_cast<DataType*>(lds_b), b_tile_layout);

    // Block coordinates in (K0, M, N, K1) space: blockIdx.x selects the M
    // tile, blockIdx.y the N tile; K0/K1 are sliced (whole extent).
    const auto block_idxs = ck::make_tuple(ck::wrapper::slice(),
                                           static_cast<ck::index_t>(blockIdx.x),
                                           static_cast<ck::index_t>(blockIdx.y),
                                           ck::wrapper::slice());
    // Copies traverse dims in order (1, 0, 2) and vectorize along dim 2 (K1).
    using DimAccessOrder             = ck::Tuple<ck::Number<1>, ck::Number<0>, ck::Number<2>>;
    constexpr ck::index_t vector_dim = 2;

    // Create tile and partition for C global memory. Use specific gemm
    // functions to get appropriate layouts.
    auto c_global_local_tile = ck::wrapper::make_local_tile(
        c_global_tensor,
        tile_shape_k0_m_n_k1,
        block_idxs,
        make_tuple(
            ck::wrapper::slice(K0PerBlock), ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(K1)));
    auto c_global_local_partition =
        ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
                                                               decltype(a_tile_layout),
                                                               decltype(b_tile_layout),
                                                               ck::wrapper::size(thread_layout),
                                                               GemmTraits>(c_global_local_tile);
    // Define and clear c vgpr register accumulator.
    auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
                                                                  decltype(a_tile_layout),
                                                                  decltype(b_tile_layout),
                                                                  ck::wrapper::size(thread_layout),
                                                                  GemmTraits>();
    ck::wrapper::clear(c_vgpr_reg);

    // Local (per-thread) partitions for lds memory.
    auto a_lds_tensor_local_partition =
        ck::wrapper::make_local_partition(a_lds_tensor, thread_layout, threadIdx.x);
    auto b_lds_tensor_local_partition =
        ck::wrapper::make_local_partition(b_lds_tensor, thread_layout, threadIdx.x);

    // Lambda: slice out the i-th K0 chunk of `tensor`, then create the local
    // tile for this block and the local partition for this thread.
    auto make_global_partition = [&](auto tensor, auto projection, ck::index_t i) {
        const auto k_slice = ck::make_tuple(
            ck::wrapper::slice(i * K0PerBlock, (i + 1) * K0PerBlock),
            ck::wrapper::slice(),
            ck::wrapper::slice());
        auto local_tile = ck::wrapper::make_local_tile(
            tensor(k_slice), tile_shape_k0_m_n_k1, block_idxs, projection);
        return ck::wrapper::make_local_partition(local_tile, thread_layout, threadIdx.x);
    };
    // A ignores the N dimension of the tile shape; B ignores M.
    auto a_global_local_partition = make_global_partition(
        a_global_tensor,
        make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
        0);
    auto b_global_local_partition = make_global_partition(
        b_global_tensor,
        make_tuple(ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
        0);

    // Per-thread VGPR staging tensors (row-major vgpr layout).
    auto a_vgpr_tensor =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
            ck::wrapper::make_layout(
                shape(a_global_local_partition),
                ck::make_tuple(ck::wrapper::size<1>(a_global_local_partition) *
                                   ck::wrapper::size<2>(a_global_local_partition),
                               ck::wrapper::size<2>(a_global_local_partition),
                               ck::Number<1>{})));
    // NOTE(review): these strides are computed from a_global_local_partition,
    // not b_global_local_partition, while the shape comes from B — correct
    // only if both per-thread partitions have identical dim-1/dim-2 sizes;
    // confirm against upstream.
    auto b_vgpr_tensor =
        ck::wrapper::make_register_tensor<ck::wrapper::MemoryTypeEnum::Vgpr, DataType>(
            ck::wrapper::make_layout(
                shape(b_global_local_partition),
                ck::make_tuple(ck::wrapper::size<1>(a_global_local_partition) *
                                   ck::wrapper::size<2>(a_global_local_partition),
                               ck::wrapper::size<2>(a_global_local_partition),
                               ck::Number<1>{})));

    // Copy first K-chunk: global -> vgpr -> lds.
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_global_local_partition,
                                                                     a_vgpr_tensor);
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_global_local_partition,
                                                                     b_vgpr_tensor);
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(a_vgpr_tensor,
                                                                     a_lds_tensor_local_partition);
    ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(b_vgpr_tensor,
                                                                     b_lds_tensor_local_partition);

    // Pipeline loop over K chunks (readfirstlane keeps the trip count scalar).
    const ck::index_t num_loop =
        __builtin_amdgcn_readfirstlane(ck::math::integer_divide_ceil(K, KPerBlock));
    // Skip the pipeline entirely when only a single K tile is processed.
    if(num_loop > 1)
    {
        ck::index_t i = 0;
        do
        {
            // Partitions for the NEXT K chunk (i + 1) — prefetched into VGPRs
            // while the current chunk is consumed from LDS.
            auto a_global_local_partition_i = make_global_partition(
                a_global_tensor,
                make_tuple(
                    ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}),
                i + 1);
            auto b_global_local_partition_i = make_global_partition(
                b_global_tensor,
                make_tuple(
                    ck::Number<1>{}, ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}),
                i + 1);
            // Copy data to A vgpr.
            ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
                a_global_local_partition_i, a_vgpr_tensor);
            // Synchronize before the LDS tiles are consumed.
            ck::block_sync_lds();
            // Copy data to B vgpr.
            ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
                b_global_local_partition_i, b_vgpr_tensor);
            // Perform gemm on the current LDS tiles, accumulating into VGPRs.
            ck::wrapper::blockwise_gemm_xdl<DataType,
                                            ck::wrapper::size(thread_layout),
                                            GemmTraits>(a_lds_tensor, b_lds_tensor, c_vgpr_reg);
            // Synchronize before the LDS tiles are overwritten.
            ck::block_sync_lds();
            // Copy the prefetched data to A and B lds tiles.
            ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
                a_vgpr_tensor, a_lds_tensor_local_partition);
            ck::wrapper::copy<DimAccessOrder, vector_dim, scalar_per_vector>(
                b_vgpr_tensor, b_lds_tensor_local_partition);
            ++i;
        } while(i < (num_loop - 1));
    }
    // Handle tail: the last K chunk is already resident in LDS.
    ck::block_sync_lds();
    ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
        a_lds_tensor, b_lds_tensor, c_vgpr_reg);
    // Store data from C vgpr to C global memory.
    ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
}
// Allocates device buffers for A (MxK), B (KxN) and C (MxN), launches the
// DeviceGemm kernel over a (ceil(M / MPerBlock), ceil(N / NPerBlock)) grid
// with size(thread_layout) threads per block, and prints the measured time
// plus derived TFLOPS and GB/s figures.
//
// Note: the buffers are never initialized or read back here — this harness
// only measures performance; it does not validate results.
template <typename DataType,
          typename GemmTraits,
          ck::index_t scalar_per_vector,
          bool DoPadding,
          typename BlockShape,
          typename ThreadLayout>
void PerformGemm(const ck::index_t M,
                 const ck::index_t N,
                 const ck::index_t K,
                 const BlockShape& tile_shape,
                 const ThreadLayout& thread_layout)
{
    // Global memory buffers.
    SimpleDeviceMem a_mem(M * K * sizeof(DataType));
    SimpleDeviceMem b_mem(K * N * sizeof(DataType));
    SimpleDeviceMem c_mem(M * N * sizeof(DataType));
    // One block per (M tile, N tile) pair.
    const ck::index_t grid_size_x =
        ck::math::integer_divide_ceil(M, ck::wrapper::size<0>(tile_shape));
    const ck::index_t grid_size_y =
        ck::math::integer_divide_ceil(N, ck::wrapper::size<1>(tile_shape));
    const auto kernel =
        DeviceGemm<DataType, GemmTraits, scalar_per_vector, BlockShape, ThreadLayout, DoPadding>;
    // avg_time is reported in milliseconds (StreamConfig second arg enables
    // timing).
    const float avg_time = launch_and_time_kernel(StreamConfig{nullptr, true},
                                                  kernel,
                                                  dim3(grid_size_x, grid_size_y, 1),
                                                  dim3(ck::wrapper::size(thread_layout)),
                                                  0,
                                                  a_mem.GetDeviceBuffer(),
                                                  b_mem.GetDeviceBuffer(),
                                                  c_mem.GetDeviceBuffer(),
                                                  M,
                                                  N,
                                                  K,
                                                  tile_shape,
                                                  thread_layout);
    // 2*M*N*K flops for a GEMM (one multiply + one add per MAC).
    std::size_t flop = std::size_t(2) * M * N * K;
    // Bytes moved: read A and B once, write C once.
    std::size_t num_btype =
        sizeof(DataType) * M * K + sizeof(DataType) * K * N + sizeof(DataType) * M * N;
    // flop / 1e9 / ms == flop / 1e12 / s == TFLOPS.
    float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
    // bytes / 1e6 / ms == bytes / 1e9 / s == GB/s.
    float gb_per_sec = num_btype / 1.E6 / avg_time;
    std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
              << gb_per_sec << " GB/s, " << std::endl;
}
// Entry point: runs a single fp16 GEMM benchmark (M=3840, N=4096, K=4096)
// using the XDL blockwise-GEMM traits with 8 scalars per vectorized access
// and no padding.
int main(int argc, char* argv[])
{
    using DataType = ck::half_t;

    // 4 x 64 x 1 threads per block with strides (1, 4, 1).
    const auto block_threads = ck::wrapper::make_layout(
        ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
        ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}, ck::Number<1>{}));
    // Per-block tile: 256 (M) x 128 (N) x 32 (K).
    const auto per_block_tile =
        ck::make_tuple(ck::Number<256>{}, ck::Number<128>{}, ck::Number<32>{});

    PerformGemm<DataType, ck::wrapper::BlockwisGemmXdlTraits_32x32Xdl_4x2XdlPerWave_8K1, 8, false>(
        3840, 4096, 4096, per_block_tile, block_threads);
    return 0;
}
cmake/Embed.cmake
0 → 100644
View file @
6ac1d6a2
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
# Select the embedding mechanism, overridable via the EMBED_USE cache entry:
#   RC      - Windows resource files (default on WIN32)
#   LD      - ld-generated binary objects (default for non-Windows shared builds)
#   CArrays - generated C char arrays (portable fallback / static builds)
if(WIN32)
    set(EMBED_USE RC CACHE STRING "Use RC or CArrays to embed data files")
    set_property(CACHE EMBED_USE PROPERTY STRINGS "RC;CArrays")
else()
    if(BUILD_SHARED_LIBS)
        set(EMBED_USE LD CACHE STRING "Use LD or CArrays to embed data files")
    else()
        set(EMBED_USE CArrays CACHE STRING "Use LD or CArrays to embed data files")
    endif()
    set_property(CACHE EMBED_USE PROPERTY STRINGS "LD;CArrays")
endif()
# The LD path shells out to ld/objcopy, so both tools must exist.
if(EMBED_USE STREQUAL "LD")
    find_program(EMBED_LD ld REQUIRED)
    find_program(EMBED_OBJCOPY objcopy REQUIRED)
endif()
# embed_wrap_string(VARIABLE <var> AT_COLUMN <n>)
#
# Re-wraps the string held in <var> (passed by NAME) into lines of at most
# <n> characters, writing the result back to <var> in the caller's scope.
# Each emitted chunk is preceded by a newline, so the output starts with "\n".
function(embed_wrap_string)
    set(options)
    set(oneValueArgs VARIABLE AT_COLUMN)
    set(multiValueArgs)
    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    # ${${PARSE_VARIABLE}} dereferences the variable named by VARIABLE.
    string(LENGTH ${${PARSE_VARIABLE}} string_length)
    math(EXPR offset "0")

    while(string_length GREATER 0)
        # Take a full column-width chunk, or whatever remains at the end.
        if(string_length GREATER ${PARSE_AT_COLUMN})
            math(EXPR length "${PARSE_AT_COLUMN}")
        else()
            math(EXPR length "${string_length}")
        endif()

        string(SUBSTRING ${${PARSE_VARIABLE}} ${offset} ${length} line)
        set(lines "${lines}\n${line}")

        math(EXPR string_length "${string_length} - ${length}")
        math(EXPR offset "${offset} + ${length}")
    endwhile()

    set(${PARSE_VARIABLE} "${lines}" PARENT_SCOPE)
endfunction()
# generate_embed_source(<name> <dir> <base_dir> SYMBOLS <syms...> FILES <files...>)
#
# Writes <dir>/include/<name>.hpp and <dir>/<name>.cpp defining a function
#   std::unordered_map<std::string_view, std::string_view> <name>();
# that maps each file's path (relative to <base_dir>) to its embedded
# contents. SYMBOLS and FILES are parallel lists (one symbol per file).
# On the RC path it additionally writes resource.h / resource.rc.
# Exports EMBED_FILES (the generated sources) to the caller's scope.
function(generate_embed_source EMBED_NAME EMBED_DIR BASE_DIRECTORY)
    set(options)
    set(oneValueArgs)
    set(multiValueArgs SYMBOLS FILES)
    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    # Windows resource IDs start at 100 and increment per file.
    set(RESOURCE_ID 100)

    list(LENGTH PARSE_SYMBOLS SYMBOLS_LEN)
    list(LENGTH PARSE_FILES FILES_LEN)
    if(NOT ${SYMBOLS_LEN} EQUAL ${FILES_LEN})
        message(FATAL_ERROR "Symbols and objects dont match: ${SYMBOLS_LEN} != ${FILES_LEN}")
    endif()
    math(EXPR LEN "${SYMBOLS_LEN} - 1")

    foreach(idx RANGE ${LEN})
        list(GET PARSE_SYMBOLS ${idx} SYMBOL)
        list(GET PARSE_FILES ${idx} FILE)
        # Map key: file path relative to the base directory.
        file(RELATIVE_PATH BASE_NAME "${BASE_DIRECTORY}" ${FILE})
        if(EMBED_USE STREQUAL "RC")
            # RC path: one #define per file plus a TEXTFILE resource entry;
            # backslashes are escaped for the .rc file.
            string(TOUPPER "${SYMBOL}" SYMBOL)
            string(APPEND FILE_IDS "#define IDR_${SYMBOL} ${RESOURCE_ID}\n")
            file(TO_NATIVE_PATH "${FILE}" NATIVE_FILE)
            string(REPLACE "\\" "\\\\" NATIVE_FILE "${NATIVE_FILE}")
            string(APPEND RC_FILE_MAPPING "IDR_${SYMBOL} TEXTFILE \"${NATIVE_FILE}\"\n")
            string(APPEND INIT_KERNELS "
{ \"${BASE_NAME}\", resource::read(IDR_${SYMBOL})},")
            math(EXPR RESOURCE_ID "${RESOURCE_ID} + 1" OUTPUT_FORMAT DECIMAL)
        else()
            # LD/CArrays path: reference the _binary_* symbols produced by
            # embed_file().
            set(START_SYMBOL "_binary_${SYMBOL}_start")
            set(LENGTH_SYMBOL "_binary_${SYMBOL}_length")
            if(EMBED_USE STREQUAL "LD")
                # ld encodes the size in the ADDRESS of _binary_*_size.
                string(APPEND EXTERNS "
extern const char ${START_SYMBOL}[];
extern const size_t _binary_${SYMBOL}_size;
const auto ${LENGTH_SYMBOL} = reinterpret_cast<size_t>(&_binary_${SYMBOL}_size);
")
            else()
                string(APPEND EXTERNS "
extern const char ${START_SYMBOL}[];
extern const size_t ${LENGTH_SYMBOL};
")
            endif()
            string(APPEND INIT_KERNELS "
{ \"${BASE_NAME}\", { ${START_SYMBOL}, ${LENGTH_SYMBOL}} },")
        endif()
    endforeach()

    if(EMBED_USE STREQUAL "RC")
        file(WRITE "${EMBED_DIR}/include/resource.h" "
#define TEXTFILE 256
${FILE_IDS}
")
        file(WRITE "${EMBED_DIR}/resource.rc" "
#include \"resource.h\"
${RC_FILE_MAPPING}
")
        # RC accessor: reads a resource out of the current module at runtime.
        set(EXTERNS "
#include <Windows.h>
#include \"resource.h\"
namespace resource {
std::string_view read(int id)
{
    HMODULE handle = GetModuleHandle(nullptr);
    HRSRC rc = FindResource(handle, MAKEINTRESOURCE(id), MAKEINTRESOURCE(TEXTFILE));
    HGLOBAL data = LoadResource(handle, rc);
    return {static_cast<const char*>(LockResource(data)), SizeofResource(handle, rc)};
}
}
")
        set(EMBED_FILES ${EMBED_DIR}/include/resource.h ${EMBED_DIR}/resource.rc)
    endif()

    # Public header: declares the accessor function.
    file(WRITE "${EMBED_DIR}/include/${EMBED_NAME}.hpp" "
#include <string_view>
#include <unordered_map>
#include <utility>
std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}();
")
    # Implementation: externs plus a function-local static map.
    file(WRITE "${EMBED_DIR}/${EMBED_NAME}.cpp" "
#include <${EMBED_NAME}.hpp>
${EXTERNS}
std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}()
{
    static std::unordered_map<std::string_view, std::string_view> result = {${INIT_KERNELS}};
    return result;
}
")
    list(APPEND EMBED_FILES ${EMBED_DIR}/${EMBED_NAME}.cpp ${EMBED_DIR}/include/${EMBED_NAME}.hpp)
    set(EMBED_FILES ${EMBED_FILES} PARENT_SCOPE)
endfunction()
# embed_file(<file> <base_dir>)
#
# Turns one data file into an embeddable artifact under
# ${CMAKE_CURRENT_BINARY_DIR}, preserving the path relative to <base_dir>:
#   LD      -> a relocatable .o whose .data is renamed to read-only .rodata
#   CArrays -> a generated .cpp holding the bytes as a char array
# Exports to the caller's scope:
#   OUTPUT_FILE   - generated .o / .cpp (unset on the RC path)
#   OUTPUT_SYMBOL - C identifier derived from the relative path
function(embed_file FILE BASE_DIRECTORY)
    message(STATUS "    ${FILE}")
    file(RELATIVE_PATH REL_FILE "${BASE_DIRECTORY}" ${FILE})
    # The _binary_<symbol>_start/_length names are built from this identifier.
    string(MAKE_C_IDENTIFIER "${REL_FILE}" OUTPUT_SYMBOL)
    get_filename_component(OUTPUT_FILE_DIR "${REL_FILE}" DIRECTORY)
    file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_FILE_DIR}")
    if(EMBED_USE STREQUAL "LD")
        set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.o")
        # ld --format=binary runs from BASE_DIRECTORY so the relative path —
        # and therefore the generated symbol names — stay stable.
        add_custom_command(
            OUTPUT "${OUTPUT_FILE}"
            COMMAND ${EMBED_LD} -r -o "${OUTPUT_FILE}" -z noexecstack --format=binary
                    "${REL_FILE}"
            COMMAND ${EMBED_OBJCOPY} --rename-section
                    .data=.rodata,alloc,load,readonly,data,contents "${OUTPUT_FILE}"
            WORKING_DIRECTORY "${BASE_DIRECTORY}"
            DEPENDS "${FILE}"
            VERBATIM)
        set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
    elseif(EMBED_USE STREQUAL "CArrays")
        # Re-run configure when the embedded file changes (the bytes are baked
        # in at configure time).
        set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${FILE})
        set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.cpp")
        # reads source file contents as hex string
        file(READ ${FILE} HEX_STRING HEX)
        # wraps the hex string into multiple lines
        embed_wrap_string(VARIABLE HEX_STRING AT_COLUMN 80)
        # adds '0x' prefix and comma suffix before and after every byte respectively
        string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " ARRAY_VALUES ${HEX_STRING})
        # removes trailing comma
        string(REGEX REPLACE ", $" "" ARRAY_VALUES ${ARRAY_VALUES})
        file(WRITE "${OUTPUT_FILE}" "
#include <cstddef>
extern const char _binary_${OUTPUT_SYMBOL}_start[] = { ${ARRAY_VALUES} };
extern const size_t _binary_${OUTPUT_SYMBOL}_length = sizeof(_binary_${OUTPUT_SYMBOL}_start);
")
        set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
    endif()
    set(OUTPUT_SYMBOL ${OUTPUT_SYMBOL} PARENT_SCOPE)
endfunction()
# add_embed_library(<name> <files...> RELATIVE <base_dir>)
#
# Creates an INTERFACE library <name> that embeds every listed file and
# exposes the generated <name>() accessor (see generate_embed_source).
# Mechanism follows EMBED_USE: LD links a static lib of binary .o files;
# RC/CArrays propagate object files through the interface target.
function(add_embed_library EMBED_NAME)
    set(options)
    set(oneValueArgs RELATIVE)
    set(multiValueArgs)
    # Files arrive as unparsed arguments; RELATIVE is the base directory used
    # for map keys and symbol names.
    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    set(EMBED_DIR ${CMAKE_CURRENT_BINARY_DIR}/embed/${EMBED_NAME})
    file(MAKE_DIRECTORY ${EMBED_DIR})
    message(STATUS "Embedding kernel files:")
    # embed_file() exports OUTPUT_FILE / OUTPUT_SYMBOL for each input.
    foreach(FILE ${PARSE_UNPARSED_ARGUMENTS})
        embed_file(${FILE} ${PARSE_RELATIVE})
        list(APPEND OUTPUT_FILES ${OUTPUT_FILE})
        list(APPEND SYMBOLS ${OUTPUT_SYMBOL})
    endforeach()
    message(STATUS "Generating embedding library '${EMBED_NAME}'")
    # Exports EMBED_FILES (the generated accessor sources).
    generate_embed_source(${EMBED_NAME} ${EMBED_DIR} "${PARSE_RELATIVE}"
                          SYMBOLS ${SYMBOLS} FILES ${PARSE_UNPARSED_ARGUMENTS})
    set(INTERNAL_EMBED_LIB embed_lib_${EMBED_NAME})
    if(EMBED_USE STREQUAL "LD")
        add_library(${INTERNAL_EMBED_LIB} STATIC ${EMBED_FILES} ${OUTPUT_FILES})
    else()
        add_library(${INTERNAL_EMBED_LIB} OBJECT ${EMBED_FILES})
    endif()
    if(EMBED_USE STREQUAL "CArrays")
        target_sources(${INTERNAL_EMBED_LIB} PRIVATE ${OUTPUT_FILES})
    endif()
    target_include_directories(${INTERNAL_EMBED_LIB} PRIVATE "${EMBED_DIR}/include")
    # Generated code legitimately uses reserved _binary_* identifiers.
    target_compile_options(${INTERNAL_EMBED_LIB} PRIVATE -Wno-reserved-identifier
                           -Wno-extern-initializer -Wno-missing-variable-declarations)
    set_target_properties(${INTERNAL_EMBED_LIB} PROPERTIES POSITION_INDEPENDENT_CODE On)
    # Public-facing interface target consumers link against.
    add_library(${EMBED_NAME} INTERFACE)
    if(EMBED_USE STREQUAL "RC")
        target_link_libraries(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
    elseif(EMBED_USE STREQUAL "LD")
        target_link_libraries(${EMBED_NAME} INTERFACE ${INTERNAL_EMBED_LIB})
    else()
        target_sources(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
    endif()
    target_include_directories(${EMBED_NAME} INTERFACE "${EMBED_DIR}/include")
endfunction()
codegen/CMakeLists.txt
0 → 100644
View file @
6ac1d6a2
# Build configuration for the composable_kernel host-side codegen library
# (ck_host), the embedded-headers library (ck_headers) and the
# ck-template-driver executable.
cmake_minimum_required(VERSION 3.16)
project(composable_kernel_host)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Collect build outputs under build/{lib,bin}.
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

# This codegen tree lives one level below the CK repository root.
set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)

find_package(ROCM)
include(ROCMInstallTargets)
include(ROCMTest)

# Pick up Embed.cmake from the repository's cmake/ directory.
list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake)
include(Embed)

# Embed every CK kernel header so generated code can be compiled at runtime
# without an installed source tree.
file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
    ${CK_ROOT}/include/ck/*.hpp)
message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
message(STATUS "RELATIVE: ${CK_ROOT}/include")
add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)

add_definitions(-std=c++17)

file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
# TODO: Use object library
add_library(ck_host STATIC ${SOURCES})
target_link_libraries(ck_host PRIVATE ck_headers)
set_target_properties(ck_host PROPERTIES
    LINKER_LANGUAGE CXX
    POSITION_INDEPENDENT_CODE ON)
target_include_directories(ck_host PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)

# Command-line driver that prints generated template instantiations.
add_executable(ck-template-driver driver/main.cpp)
target_link_libraries(ck-template-driver ck_host)

rocm_install(
    TARGETS ck_host ck_headers
    EXPORT ck_hostTargets)
rocm_install(DIRECTORY include/ck DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

if(BUILD_TESTING)
    add_subdirectory(test)
endif()
codegen/driver/main.cpp
0 → 100644
View file @
6ac1d6a2
#include <algorithm>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
#include "ck/host/device_gemm_multiple_d/operation.hpp"
#include "ck/host/stringutils.hpp"
using
ck
::
host
::
Transform
;
// Registry of code emitters keyed by operation name. Each registered entry
// lazily produces the list of template-instantiation tuple strings for one
// operation family.
struct Emitters
{
    // operation name -> generator returning one "std::tuple<...>" string per
    // batch of configurations.
    std::unordered_map<std::string, std::function<std::vector<std::string>()>> m;

    // Registers T under `name`. T must provide a static CreateOperations()
    // returning batches of operation descriptors; generation is deferred
    // until Emit() is called.
    template <class T>
    void Register(const std::string& name)
    {
        m[name] = [] {
            auto configs = T::CreateOperations();
            return Transform(configs, [](const auto& ops) { return ToTuple(ops); });
        };
    }

    // Renders one batch of operations as a "std::tuple<...>" type string,
    // one template instantiation per line.
    template <class T>
    static std::string ToTuple(const T& ops)
    {
        auto templates = Transform(
            ops, [](const auto& op) { return "    " + op.ToSolution().ToTemplateString(); });
        return "std::tuple<\n" + ck::host::JoinStrings(templates, ",\n") + ">";
    }

    // Runs the generator registered under `name` and joins the tuple strings
    // with newlines. Throws std::out_of_range for an unknown name (map::at).
    std::string Emit(const std::string& name) { return ck::host::JoinStrings(m.at(name)(), "\n"); }

    // Returns all registered operation names (unordered).
    std::vector<std::string> List() const
    {
        return Transform(m, [](auto&& p) { return p.first; });
    }
};
// Command-line driver: prints the generated C++ template-instantiation lists
// for the registered codegen operation families.
//
// Usage: ck-template-driver [TEMPLATE]...
// With no arguments (or -h/--help) the usage text and the available template
// names are printed. Returns 1 if an unknown template name is requested.
int main(int argc, const char* argv[])
{
    const std::string prog = argv[0];
    const std::vector<std::string> args(argv + 1, argv + argc);
    Emitters e;
    e.Register<ck::host::device_gemm_multiple_d::Operation_Xdl_CShuffle>(
        "DeviceGemmMultipleD_Xdl_CShuffle");

    // Show usage when no templates were requested or help was asked for.
    // (const auto& avoids copying each std::string argument.)
    if(args.empty() or std::any_of(args.begin(), args.end(), [](const auto& arg) {
           return arg == "-h" or arg == "--help";
       }))
    {
        std::cout << "USAGE:" << std::endl;
        std::cout << "    " << prog << " [TEMPLATE]" << std::endl;
        std::cout << std::endl;
        std::cout << "FLAGS:" << std::endl;
        std::cout << "    -h, --help    Show help" << std::endl;
        std::cout << std::endl;
        std::cout << "TEMPLATES:" << std::endl;
        for(const auto& x : e.List())
            std::cout << "    " << x << std::endl;
        std::cout << std::endl;
        return 0;
    }

    for(const auto& name : args)
    {
        // Emitters::Emit uses map::at(), which throws std::out_of_range for
        // unregistered names — report that cleanly instead of terminating
        // with an uncaught exception.
        try
        {
            std::cout << e.Emit(name) << std::endl;
        }
        catch(const std::out_of_range&)
        {
            std::cerr << "Unknown template: " << name << std::endl;
            return 1;
        }
    }
    return 0;
}
codegen/include/ck/host/device_gemm_multiple_d.hpp
0 → 100644
View file @
6ac1d6a2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include <sstream>
#include <iterator>
#include <numeric>
#include "ck/host/types.hpp"
namespace
ck
{
namespace
host
{
namespace
device_gemm_multiple_d
{
// Host-side description of one "GEMM with multiple D tensors" problem:
// sizes, layouts (as transpose flags) and element types, from which concrete
// tuning Solutions are produced.
//
// NOTE(review): this struct appears to duplicate
// device_gemm_multiple_d/problem.hpp's Problem in the same namespace with
// different element-op defaults — including both headers would violate the
// ODR; confirm which header is the survivor.
struct Problem
{
    std::size_t M = 0; // rows of A and E
    std::size_t N = 0; // columns of B and E
    std::size_t K = 0; // reduction dimension
    bool TransA = false;
    bool TransB = false;
    bool TransE = false;
    std::vector<bool> DsTrans = {}; // one transpose flag per D tensor
    DataType ADataType = DataType::Half;
    DataType BDataType = DataType::Half;
    DataType EDataType = DataType::Half;
    std::vector<DataType> DsDataType = {}; // one element type per D tensor
    // Fully-qualified C++ type names of the elementwise operators, spliced
    // into generated template strings.
    std::string AElementOp = "ck::tensor_operation::element_wise::PassThrough";
    std::string BElementOp = "ck::tensor_operation::element_wise::PassThrough";
    // NOTE(review): "ck::Tuple<>" is a tuple type, not an elementwise op, and
    // problem.hpp defaults this field to PassThrough — confirm intended
    // default.
    std::string CDEElementOp = "ck::Tuple<>";

    // Header the generated instantiations must #include.
    std::string GetIncludeHeader() const;
    // Tuning-space instantiations of this problem for the given GPU arch.
    std::vector<Solution> GetSolutions(const std::string& arch) const;
};
}
// namespace device_gemm_multiple_d
}
// namespace host
}
// namespace ck
codegen/include/ck/host/device_gemm_multiple_d/operation.hpp
0 → 100644
View file @
6ac1d6a2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <string>
#include "ck/host/types.hpp"
#include "ck/host/operation/gemm.hpp"
#include "ck/host/device_gemm_multiple_d/problem.hpp"
namespace
ck
{
namespace
host
{
namespace
device_gemm_multiple_d
{
// One concrete XDL + CShuffle GEMM-multiple-D operation instance: tensor
// descriptors, elementwise ops and the full tile/transfer tuning
// configuration needed to render a device-op template string.
struct Operation_Xdl_CShuffle
{
    // Full default tuning space, grouped into batches.
    static std::vector<std::vector<Operation_Xdl_CShuffle>> CreateOperations();
    // Tuning space restricted to one Problem description.
    static std::vector<Operation_Xdl_CShuffle> CreateOperations(const Problem& prob);

    TensorDesc A{};
    TensorDesc B{};
    DataType acc     = DataType::Float; // accumulator element type
    DataType cs_type = DataType::Half;  // CShuffle element type
    std::vector<TensorDesc> Ds = {};    // auxiliary input tensors
    TensorDesc E{};                     // output tensor
    // Fully-qualified C++ type names of the elementwise operators.
    std::string a_elem_op   = PassThrough;
    std::string b_elem_op   = PassThrough;
    std::string cde_elem_op = Bilinear;
    std::string gemm_specialization =
        "ck::tensor_operation::device::GemmSpecialization::Default";
    // Tiling and data-movement tuning parameters.
    operation::TileDesc tile_desc{};
    operation::BlockTransferDesc a_block_transfer{};
    operation::BlockTransferDesc b_block_transfer{};
    operation::CShuffleDesc cshuffle{};
    operation::CBlockTransferDesc c_block_transfer{};

    // Renders this configuration as a Solution (template string et al.).
    Solution ToSolution() const;
};
}
// namespace device_gemm_multiple_d
}
// namespace host
}
// namespace ck
codegen/include/ck/host/device_gemm_multiple_d/problem.hpp
0 → 100644
View file @
6ac1d6a2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <string>
#include "ck/host/types.hpp"
namespace
ck
{
namespace
host
{
namespace
device_gemm_multiple_d
{
// Host-side description of one "GEMM with multiple D tensors" problem:
// sizes, layouts (as transpose flags), element types and elementwise-op type
// names, from which concrete tuning Solutions are produced.
struct Problem
{
    std::size_t M = 0; // rows of A and E
    std::size_t N = 0; // columns of B and E
    std::size_t K = 0; // reduction dimension
    bool TransA = false;
    bool TransB = false;
    bool TransE = false;
    std::vector<bool> DsTrans = {}; // one transpose flag per D tensor
    DataType ADataType = DataType::Half;
    DataType BDataType = DataType::Half;
    DataType EDataType = DataType::Half;
    std::vector<DataType> DsDataType = {}; // one element type per D tensor
    // Fully-qualified C++ type names of the elementwise operators, spliced
    // into generated template strings.
    std::string AElementOp   = PassThrough;
    std::string BElementOp   = PassThrough;
    std::string CDEElementOp = PassThrough;

    // Header the generated instantiations must #include.
    std::string GetIncludeHeader() const;
    // Tuning-space instantiations of this problem for the given GPU arch.
    std::vector<Solution> GetSolutions(const std::string& arch) const;
};
}
// namespace device_gemm_multiple_d
}
// namespace host
}
// namespace ck
codegen/include/ck/host/headers.hpp
0 → 100644
View file @
6ac1d6a2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
#include <string_view>
#include <utility>
#include <unordered_map>
#include <vector>
namespace
ck
{
namespace
host
{
std
::
unordered_map
<
std
::
string_view
,
std
::
string_view
>
GetHeaders
();
}
// namespace host
}
// namespace ck
codegen/include/ck/host/operation/gemm.hpp
0 → 100644
View file @
6ac1d6a2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
namespace
ck
{
namespace
host
{
namespace
operation
{
struct
TileDesc
{
int
block_size
=
0
;
int
m_per_block
=
0
;
int
n_per_block
=
0
;
int
k_per_block
=
0
;
int
ak1
=
0
;
int
bk1
=
0
;
int
m_per_XDL
=
0
;
int
n_per_XDL
=
0
;
int
m_Xdl_per_wave
=
0
;
int
n_Xdl_per_wave
=
0
;
int
num_gemmk_prefetch_stage
=
0
;
};
// Blockwise global->LDS transfer parameters for one input tensor. The
// string fields hold ck::Sequence<...> type names spliced into generated
// template strings.
struct BlockTransferDesc
{
    std::string thread_cluster_length        = "";
    std::string thread_cluster_arrange_order = "";
    std::string src_access_order             = "";
    int src_vec_dim              = 0; // dimension along which loads vectorize
    int src_scalar_per_vector    = 0;
    int dst_scalar_per_vector_k1 = 0;
    int lds_add_extra_dim        = 0; // extra LDS padding dimension flag
};
struct
CShuffleDesc
{
int
m_Xdl_per_wave_per_shuffle
=
0
;
int
n_Xdl_per_wave_per_shuffle
=
0
;
};
// Blockwise LDS->global transfer parameters for the C/E output tensor.
struct CBlockTransferDesc
{
    // ck::Sequence<...> type name describing the thread cluster shape.
    std::string cluster_lengths_m_block_m_wave_m_per_Xdl_n_block_n_wave_n_per_Xdl = "";
    int scalar_per_vector_n_wave_n_per_Xdl = 0;
};
}
// namespace operation
}
// namespace host
}
// namespace ck
codegen/include/ck/host/stringutils.hpp
0 → 100644
View file @
6ac1d6a2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <cassert>
#include <cctype>    // std::isspace (used by trim)
#include <iterator>  // std::next, std::back_inserter
#include <numeric>
#include <stdexcept> // std::runtime_error (thrown by InterpolateString)
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
namespace
ck
{
namespace
host
{
/// Strips characters satisfying the predicate `f` from both ends of `s` and
/// returns the remaining middle part as a new string. Returns "" when every
/// character matches the predicate.
template <class F>
std::string trim(const std::string& s, F f)
{
    auto first = s.begin();
    auto last  = s.end();
    // Skip matching characters from the front...
    while(first != last && f(*first))
        ++first;
    // ...and from the back (string iterators are random access).
    while(last != first && f(*(last - 1)))
        --last;
    return std::string(first, last);
}
/// Strips whitespace (per std::isspace) from both ends of `s`.
/// The character is cast to unsigned char before the std::isspace call to
/// avoid undefined behavior for negative char values.
inline std::string trim(const std::string& s)
{
    std::string::size_type b = 0;
    std::string::size_type e = s.size();
    while(b < e && std::isspace(static_cast<unsigned char>(s[b])) != 0)
        ++b;
    while(e > b && std::isspace(static_cast<unsigned char>(s[e - 1])) != 0)
        --e;
    return s.substr(b, e - b);
}
/// Concatenates the elements of `strings`, inserting `delim` between
/// consecutive elements. An empty range yields "".
template <class Strings>
inline std::string JoinStrings(Strings strings, const std::string& delim)
{
    std::string joined;
    bool first = true;
    for(const auto& piece : strings)
    {
        if(!first)
            joined += delim;
        joined += piece;
        first = false;
    }
    return joined;
}
/// Template interpolation: every "${expr}" (delimiters configurable via
/// `start`/`end`) in `input` is replaced by the range returned from
/// f(first, last), where [first, last) are const iterators spanning the text
/// between the delimiters. Text outside delimiters is copied verbatim.
/// Throws std::runtime_error when a start delimiter has no matching end.
template <class F>
inline std::string InterpolateString(const std::string& input,
                                     F f,
                                     std::string start = "${",
                                     std::string end   = "}")
{
    std::string out;
    out.reserve(input.size());
    std::string::size_type pos = 0;
    while(pos != input.size())
    {
        const auto open = input.find(start, pos);
        if(open == std::string::npos)
        {
            // No further placeholders: copy the tail and finish.
            out.append(input, pos, std::string::npos);
            break;
        }
        out.append(input, pos, open - pos);
        // Like the original std::search-based version, the end delimiter is
        // searched for from the start delimiter's position onward.
        const auto close = input.find(end, open);
        if(close == std::string::npos)
        {
            throw std::runtime_error("Unbalanced brackets");
        }
        const auto replacement = f(input.begin() + open + start.size(), input.begin() + close);
        out.append(replacement.begin(), replacement.end());
        pos = close + end.size();
    }
    return out;
}
/// Map-based interpolation: every "${key}" in `input` is replaced by
/// vars.at(key), with surrounding whitespace trimmed from the key first.
/// Throws std::runtime_error for a key absent from `vars` (and, via the
/// generic overload, for an unterminated placeholder).
inline std::string InterpolateString(const std::string& input,
                                     const std::unordered_map<std::string, std::string>& vars,
                                     std::string start = "${",
                                     std::string end   = "}")
{
    auto lookup = [&](auto first, auto last) {
        const auto key   = trim({first, last});
        const auto entry = vars.find(key);
        if(entry == vars.end())
            throw std::runtime_error("Unknown key: " + key);
        return entry->second;
    };
    return InterpolateString(input, lookup, std::move(start), std::move(end));
}
/// Applies `f` to every element of `r` and collects the results, in order,
/// into a std::vector of f's result type.
template <class Range, class F>
inline auto Transform(const Range& r, F f) -> std::vector<decltype(f(*r.begin()))>
{
    std::vector<decltype(f(*r.begin()))> out;
    for(const auto& x : r)
        out.push_back(f(x));
    return out;
}
/// Element-wise zip-transform: applies `f` to corresponding elements of `r1`
/// and `r2` and collects the results, in order, into a std::vector.
/// The ranges must be the same length (checked only in debug builds).
template <class Range1, class Range2, class F>
inline auto Transform(const Range1& r1, const Range2& r2, F f)
    -> std::vector<decltype(f(*r1.begin(), *r2.begin()))>
{
    assert(std::distance(r1.begin(), r1.end()) == std::distance(r2.begin(), r2.end()));
    std::vector<decltype(f(*r1.begin(), *r2.begin()))> out;
    auto it1 = r1.begin();
    auto it2 = r2.begin();
    for(; it1 != r1.end(); ++it1, ++it2)
        out.push_back(f(*it1, *it2));
    return out;
}
}
// namespace host
}
// namespace ck
codegen/include/ck/host/types.hpp
0 → 100644
View file @
6ac1d6a2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
#include <sstream>
#include <utility>
#include <unordered_map>
#include <vector>
namespace
ck
{
namespace
host
{
// One concrete kernel instantiation produced by the generator: a template
// string plus the map of parameter name -> textual value substituted into it.
// (Member functions other than the template below are defined out of line.)
struct Solution
{
    Solution() = default;
    // `str` is the kernel template text; `values` maps each template
    // parameter name to its chosen textual value.
    Solution(std::string str, std::unordered_map<std::string, std::string> values);

    // Renders the template with all parameter values substituted.
    std::string ToTemplateString() const;

    // Returns the raw textual value of the named template parameter.
    std::string GetTemplateParameter(const std::string& name) const;

    // Typed convenience accessor: parses the textual value of the named
    // parameter into T via stream extraction.
    // FIX(review): `result` is now value-initialized; previously it was
    // default-initialized, so a failed extraction (non-numeric text for an
    // arithmetic T) returned an uninitialized value (undefined behavior).
    // NOTE(review): extraction failure is still silent -- result is T{}.
    template <class T>
    T GetTemplateParameter(const std::string& name) const
    {
        T result{};
        std::stringstream ss(GetTemplateParameter(name));
        ss >> result;
        return result;
    }

    private:
    std::string template_str;                                          // raw template text
    std::unordered_map<std::string, std::string> template_values;      // name -> value
};
// Element data types a generated kernel can operate on.
enum class DataType
{
    Half,
    Float,
    Int8,
    Int32
};
std::string ToString(DataType dt);

// Storage order of a 2-D tensor operand.
enum class Layout
{
    Row,
    Column
};
std::string ToString(Layout dl);

// GEMM kernel flavor; only the default pipeline is exposed so far.
enum class GemmType
{
    Default
};
std::string ToString(GemmType gt);

// Element type plus storage layout of one tensor operand.
struct TensorDesc
{
    DataType element;
    Layout layout;
};

// Renders an integer list as template-argument text.
// NOTE(review): presumably produces "ck::Sequence<...>" -- confirm against
// the implementation.
std::string SequenceStr(const std::vector<int>& v);

// Renders a string list as template-argument text.
// NOTE(review): presumably produces "ck::Tuple<...>" -- confirm against the
// implementation.
std::string MakeTuple(const std::vector<std::string>& v);

// Convenience variable template: S<1, 2, 3> is the rendered sequence string
// for those integers.
// NOTE(review): this is a header-scope variable with dynamic (run-time)
// initialization; each instantiation is initialized in every TU that uses it
// -- confirm this is intended.
template <int... xs>
const std::string S = SequenceStr({xs...});

// Fully qualified names of CK elementwise operator types, spliced into
// generated kernel templates as the A/B/CDE elementwise ops.
constexpr const char* PassThrough = "ck::tensor_operation::element_wise::PassThrough";
constexpr const char* Bilinear    = "ck::tensor_operation::element_wise::Bilinear";
}
// namespace host
}
// namespace ck
Prev
1
2
3
4
5
6
7
8
9
…
16
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment