gaoqiong / composable_kernel_ROCM · Commits · 10e8be48

Commit 10e8be48 (unverified), authored Oct 01, 2024 by M.Emin Ozturk, committed by GitHub on Oct 01, 2024.

Merge branch 'develop' into gemm_bf16_sk_muozturk

Parents: b416c877, 11b7a4db
Changes: 48 files in the commit; this page shows 8 changed files with 176 additions and 10 deletions (+176, -10).
Changed files:

    library/src/tensor_operation_instance/gpu/CMakeLists.txt        (+6, -4)
    library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt    (+16, -6)
    script/cmake-ck-dev.sh                                           (+3, -0)
    script/cmake-ck-release.sh                                       (+3, -0)
    test/CMakeLists.txt                                              (+1, -0)
    test/ck_tile/CMakeLists.txt                                      (+1, -0)
    test/ck_tile/image_to_column/CMakeLists.txt                      (+4, -0)
    test/ck_tile/image_to_column/test_tile_image_to_column.cpp     (+142, -0)
library/src/tensor_operation_instance/gpu/CMakeLists.txt

@@ -102,12 +102,14 @@ function(add_instance_library INSTANCE_NAME)
         set(FMHA_FWD_FAST_EXP2 true)
       endif()
       if(FMHA_FWD_FAST_EXP2)
-        list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
+        list(APPEND FMHA_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
       else()
-        list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
+        list(APPEND FMHA_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
       endif()
-      list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-float-equal)
-      target_compile_options(device_mha_instance PRIVATE ${EXAMPLE_FMHA_FWD_COMPILE_OPTIONS})
+      list(APPEND FMHA_COMPILE_OPTIONS -Wno-float-equal)
+      list(APPEND FMHA_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_SPLITKV_API=1)
+      list(APPEND FMHA_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=1)
+      target_compile_options(device_mha_instance PRIVATE ${FMHA_COMPILE_OPTIONS})
     endif()
   endif()
   target_compile_features(${INSTANCE_NAME} PUBLIC)
  ...
library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt

@@ -32,23 +32,33 @@ if(EXISTS ${FMHA_CPP_FOLDER}/blob_list.txt)
   file(REMOVE ${FMHA_CPP_FOLDER}/blob_list.txt)
 endif()

+set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd")
+
 # generate a list of kernels, but not actually emit files at config stage
+# Note: The receipt 3 arg filters the generated backwards instances to reduce compilation time.
+# With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad.
 execute_process(
-  COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py
+  COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
   --list_blobs ${FMHA_CPP_FOLDER}/blob_list.txt
+  --api ${FMHA_KNOWN_APIS}
+  --receipt 3
   RESULT_VARIABLE ret
 )
 if(ret AND NOT ret EQUAL 0)
   message(FATAL_ERROR "CK Tile MHA FAILED to genrate a list of kernels via Python.")
 else()
-  file(STRINGS ${FMHA_CPP_FOLDER}/blob_list.txt FMHA_FWD_GEN_BLOBS)
+  file(STRINGS ${FMHA_CPP_FOLDER}/blob_list.txt FMHA_GEN_BLOBS)
 endif()

 # actually generate the kernel content now
+# Note: The receipt 3 arg filters the generated backwards instances to reduce compilation time.
+# With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad.
 add_custom_command(
-  OUTPUT ${FMHA_FWD_GEN_BLOBS}
-  COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py
+  OUTPUT ${FMHA_GEN_BLOBS}
+  COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
   --output_dir ${FMHA_CPP_FOLDER}
+  --api ${FMHA_KNOWN_APIS}
+  --receipt 3
   COMMENT "Generating mha kernel (cpp) files now ..."
   VERBATIM
 )
 ...
@@ -57,12 +67,12 @@ add_custom_command(
 # have filename. Since, it was cauing the cmake
 # to throw "File name too long"
 set(device_files)
-foreach(filepath IN LISTS FMHA_FWD_GEN_BLOBS)
+foreach(filepath IN LISTS FMHA_GEN_BLOBS)
   get_filename_component(filename ${filepath} NAME)
   # Append the filename to the device_files list
   list(APPEND device_files ${filename})
 endforeach()

-add_custom_target(generate_cpp_files DEPENDS ${FMHA_FWD_GEN_BLOBS})
+add_custom_target(generate_cpp_files DEPENDS ${FMHA_GEN_BLOBS})

 add_instance_library(device_mha_instance ${device_files})
 ...
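The hunks above split kernel generation into a configure-time listing pass (execute_process with --list_blobs) and a build-time emission pass (add_custom_command with --output_dir), both now restricted to the APIs in FMHA_KNOWN_APIS with --receipt 3. As a rough, hedged illustration only, the equivalent manual invocations might look like the sketch below; the interpreter, the build/mha_cpp output directory, and the assumption that ${FMHA_SRC_FOLDER} points at example/ck_tile/01_fmha (the path used before this change) are placeholders, not part of the commit.

    # Hedged sketch: mirror the two CMake steps by hand.
    # Configure stage: only list the kernel blobs that would be generated.
    python3 example/ck_tile/01_fmha/generate.py \
        --list_blobs build/mha_cpp/blob_list.txt \
        --api "fwd,fwd_splitkv,fwd_appendkv,bwd" \
        --receipt 3

    # Build stage: actually emit the kernel .cpp files into the output directory.
    python3 example/ck_tile/01_fmha/generate.py \
        --output_dir build/mha_cpp \
        --api "fwd,fwd_splitkv,fwd_appendkv,bwd" \
        --receipt 3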
script/cmake-ck-dev.sh

@@ -7,8 +7,10 @@ MY_PROJECT_SOURCE=$1
 if [ $# -ge 2 ] ; then
 GPU_TARGETS=$2
+REST_ARGS=${@:3}
 else
 GPU_TARGETS="gfx908;gfx90a;gfx940"
+REST_ARGS=
 fi
 cmake \
 ...
@@ -20,4 +22,5 @@ cmake
 -D GPU_TARGETS=$GPU_TARGETS \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 -D USE_BITINT_EXTENSION_INT4=OFF \
+$REST_ARGS \
 ${MY_PROJECT_SOURCE}
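With the REST_ARGS change, anything after the second positional argument is forwarded to cmake unmodified. A hedged usage sketch follows; the source path and the extra -D flag are placeholder examples, not part of the commit.

    # Positional args: <project source dir> [GPU_TARGETS] [extra cmake args ...]
    # Extra arguments after the GPU target list are passed straight through to cmake.
    ./script/cmake-ck-dev.sh ~/composable_kernel "gfx90a" -DCMAKE_EXPORT_COMPILE_COMMANDS=ON

    # With only the source dir given, GPU_TARGETS falls back to "gfx908;gfx90a;gfx940"
    # and REST_ARGS stays empty.
    ./script/cmake-ck-dev.sh ~/composable_kernel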
script/cmake-ck-release.sh

@@ -7,8 +7,10 @@ MY_PROJECT_SOURCE=$1
 if [ $# -ge 2 ] ; then
 GPU_TARGETS=$2
+REST_ARGS=${@:3}
 else
 GPU_TARGETS="gfx908;gfx90a;gfx940"
+REST_ARGS=
 fi
 cmake \
 ...
@@ -20,5 +22,6 @@ cmake
 -D GPU_TARGETS=$GPU_TARGETS \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 -D USE_BITINT_EXTENSION_INT4=OFF \
+$REST_ARGS \
 ${MY_PROJECT_SOURCE}
test/CMakeLists.txt

@@ -173,6 +173,7 @@ function(add_gtest_executable TEST_NAME)
 endfunction()

 add_compile_options(-Wno-c++20-extensions)

+add_subdirectory(ck_tile)
 add_subdirectory(magic_number_division)
 add_subdirectory(space_filling_curve)
 add_subdirectory(conv_util)
 ...
test/ck_tile/CMakeLists.txt (new file, mode 100644)

add_subdirectory(image_to_column)
test/ck_tile/image_to_column/CMakeLists.txt (new file, mode 100644)

# Currently ck_tile is only built on gfx9
if(GPU_TARGETS MATCHES "gfx9")
    add_gtest_executable(test_tile_image_to_column test_tile_image_to_column.cpp)
endif()
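The new test is registered through the repository's add_gtest_executable helper and is only configured when GPU_TARGETS matches gfx9. A hedged sketch of building and running it, assuming a build tree named build/ configured for a gfx9 target and assuming the helper registers the binary with CTest (otherwise the gtest binary can be launched directly from the build tree):

    # Assumed build tree configured with a gfx9 target, e.g. via cmake-ck-dev.sh.
    cd build
    cmake --build . --target test_tile_image_to_column
    ctest -R test_tile_image_to_column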
test/ck_tile/image_to_column/test_tile_image_to_column.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include <algorithm>

#include <gtest/gtest.h>

#include "ck_tile/host.hpp"
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/image_to_column.hpp"

// Host API implementation
template <typename DataType>
class TestCkTileImageToColumn : public ::testing::Test
{
    static constexpr ck_tile::index_t VectorSize  = 1;
    static constexpr ck_tile::index_t NDimSpatial = 2;

    protected:
    void Run(const ck_tile::conv::ConvParam conv_params)
    {
        using ImLayout = ck_tile::tensor_layout::convolution::NHWGC;

        const auto G = conv_params.G_;
        const auto N = conv_params.N_;
        const auto C = conv_params.C_;

        const ck_tile::long_index_t NDoHoWo =
            N * std::accumulate(conv_params.output_spatial_lengths_.begin(),
                                std::next(conv_params.output_spatial_lengths_.begin(), NDimSpatial),
                                1,
                                std::multiplies<>());
        const ck_tile::long_index_t CZYX =
            C * std::accumulate(conv_params.filter_spatial_lengths_.begin(),
                                std::next(conv_params.filter_spatial_lengths_.begin(), NDimSpatial),
                                1,
                                std::multiplies<>());

        const auto in_desc =
            ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params);
        const auto out_desc = ck_tile::HostTensorDescriptor({G, NDoHoWo, CZYX});

        // host verify
        ck_tile::HostTensor<DataType> in(in_desc);
        ck_tile::HostTensor<DataType> out_device(out_desc);
        ck_tile::HostTensor<DataType> out_host(out_desc);

        std::cout << "input: " << in.mDesc << std::endl;
        std::cout << "output: " << out_device.mDesc << std::endl;

        ck_tile::FillUniformDistributionIntegerValue<DataType>{-5.f, 5.f}(in);

        ck_tile::DeviceMem in_device_buf(in.get_element_space_size_in_bytes());
        ck_tile::DeviceMem out_device_buf(out_device.get_element_space_size_in_bytes());

        in_device_buf.ToDevice(in.data());

        using thread_tile = ck_tile::sequence<4, 4>;
        using warp_tile   = ck_tile::sequence<8, 128>;
        using block_tile  = ck_tile::sequence<32, 128>;

        using Shape = ck_tile::TileImageToColumnShape<thread_tile, warp_tile, block_tile>;

        using PipelineProblem = ck_tile::
            BlockImageToColumnProblem<DataType, DataType, Shape, NDimSpatial, VectorSize, VectorSize>;

        using Kernel = ck_tile::ImageToColumn<PipelineProblem>;

        auto kargs = Kernel::MakeKargs(
            in_device_buf.GetDeviceBuffer(),
            out_device_buf.GetDeviceBuffer(),
            G,
            N,
            C,
            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.input_spatial_lengths_),
            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.filter_spatial_lengths_),
            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.output_spatial_lengths_),
            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial + 3>(in_desc.get_strides()),
            ck_tile::to_array<ck_tile::long_index_t, 3>(out_desc.get_strides()),
            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.conv_filter_strides_),
            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.conv_filter_dilations_),
            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.input_left_pads_),
            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.input_right_pads_));

        const dim3 grids = Kernel::GridSize(
            kargs.N * kargs.output_spatial_lengths[0] * kargs.output_spatial_lengths[1],
            kargs.filter_spatial_lengths[0] * kargs.filter_spatial_lengths[1] * kargs.C,
            kargs.G);
        constexpr dim3 blocks = Kernel::BlockSize();

        constexpr ck_tile::index_t kBlockPerCu = 2;

        ck_tile::launch_kernel(
            ck_tile::stream_config{},
            ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));

        // reference
        ck_tile::reference_im2col<DataType, DataType, NDimSpatial>(in, out_host, conv_params);

        out_device_buf.FromDevice(out_device.data());

        bool pass = ck_tile::check_err(out_device, out_host);
        EXPECT_TRUE(pass);
    }
};

class TestCkTileImageToColumnFloat : public TestCkTileImageToColumn<float>
{
};

class TestCkTileImageToColumnHalf : public TestCkTileImageToColumn<ck_tile::half_t>
{
};

TEST_F(TestCkTileImageToColumnFloat, TestCorrectness)
{
    this->Run({2, 2, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
    this->Run({2, 2, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
    this->Run({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {3, 3}, {1, 1}, {0, 0}, {0, 0}});
    this->Run({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
    this->Run({2, 2, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}});
}

TEST_F(TestCkTileImageToColumnHalf, TestCorrectness)
{
    this->Run({2, 2, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
    this->Run({2, 2, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
    this->Run({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {3, 3}, {1, 1}, {0, 0}, {0, 0}});
    this->Run({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
    this->Run({2, 2, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}});
}