Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
6975cb8f
Commit
6975cb8f
authored
Sep 16, 2024
by
carlushuang
Browse files
Merge remote-tracking branch 'origin/develop' into ck_tile/fav3_fwd_sept
parents
33aff2ef
6834e5ee
Changes
33
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
599 additions
and
75 deletions
+599
-75
CMakeLists.txt
CMakeLists.txt
+5
-1
Jenkinsfile
Jenkinsfile
+58
-8
codegen/test/CMakeLists.txt
codegen/test/CMakeLists.txt
+3
-0
codegen/test/rtc/include/rtc/compile_kernel.hpp
codegen/test/rtc/include/rtc/compile_kernel.hpp
+2
-2
codegen/test/rtc/include/rtc/tmp_dir.hpp
codegen/test/rtc/include/rtc/tmp_dir.hpp
+2
-2
codegen/test/rtc/src/compile_kernel.cpp
codegen/test/rtc/src/compile_kernel.cpp
+4
-4
codegen/test/rtc/src/tmp_dir.cpp
codegen/test/rtc/src/tmp_dir.cpp
+3
-3
docs/sphinx/requirements.in
docs/sphinx/requirements.in
+1
-1
docs/sphinx/requirements.txt
docs/sphinx/requirements.txt
+1
-1
example/ck_tile/01_fmha/CMakeLists.txt
example/ck_tile/01_fmha/CMakeLists.txt
+8
-0
example/ck_tile/03_gemm/gemm_basic.cpp
example/ck_tile/03_gemm/gemm_basic.cpp
+126
-42
example/ck_tile/03_gemm/gemm_basic.hpp
example/ck_tile/03_gemm/gemm_basic.hpp
+1
-0
example/ck_tile/03_gemm/script/run_full_test.sh
example/ck_tile/03_gemm/script/run_full_test.sh
+25
-0
example/ck_tile/03_gemm/script/smoke_test.sh
example/ck_tile/03_gemm/script/smoke_test.sh
+35
-0
include/ck/filesystem.hpp
include/ck/filesystem.hpp
+135
-0
include/ck_tile/host/reference/reference_gemm.hpp
include/ck_tile/host/reference/reference_gemm.hpp
+118
-1
include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
+2
-3
include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
+3
-7
library/include/ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp
...ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp
+9
-0
library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
...e/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
+58
-0
No files found.
CMakeLists.txt
View file @
6975cb8f
...
@@ -26,11 +26,15 @@ set(version 1.1.0)
...
@@ -26,11 +26,15 @@ set(version 1.1.0)
project
(
composable_kernel VERSION
${
version
}
LANGUAGES CXX HIP
)
project
(
composable_kernel VERSION
${
version
}
LANGUAGES CXX HIP
)
include
(
CTest
)
include
(
CTest
)
# Usage: for customized Python location cmake -DCK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8"
# CK Codegen requires dataclass which is added in Python 3.7
# Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04
if
(
NOT CK_USE_ALTERNATIVE_PYTHON
)
if
(
NOT CK_USE_ALTERNATIVE_PYTHON
)
find_package
(
Python3 3.
6
COMPONENTS Interpreter REQUIRED
)
find_package
(
Python3 3.
8
COMPONENTS Interpreter REQUIRED
)
else
()
else
()
message
(
"Using alternative python version"
)
message
(
"Using alternative python version"
)
set
(
EXTRA_PYTHON_PATH
)
set
(
EXTRA_PYTHON_PATH
)
# this is overly restrictive, we may need to be more flexible on the following
string
(
REPLACE
"/bin/python3.8"
""
EXTRA_PYTHON_PATH
"
${
CK_USE_ALTERNATIVE_PYTHON
}
"
)
string
(
REPLACE
"/bin/python3.8"
""
EXTRA_PYTHON_PATH
"
${
CK_USE_ALTERNATIVE_PYTHON
}
"
)
message
(
"alternative python path is:
${
EXTRA_PYTHON_PATH
}
"
)
message
(
"alternative python path is:
${
EXTRA_PYTHON_PATH
}
"
)
find_package
(
Python3 3.6 COMPONENTS Interpreter REQUIRED
)
find_package
(
Python3 3.6 COMPONENTS Interpreter REQUIRED
)
...
...
Jenkinsfile
View file @
6975cb8f
...
@@ -703,7 +703,7 @@ def process_results(Map conf=[:]){
...
@@ -703,7 +703,7 @@ def process_results(Map conf=[:]){
}
}
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS
=
BRANCH_NAME
==
"develop"
?
'''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;
RUN_CK_TILE_TESTS=true
CRON_SETTINGS
=
BRANCH_NAME
==
"develop"
?
'''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_
FMHA_TESTS=;RUN_CK_TILE_GEMM_
TESTS=true
0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
...
@@ -775,9 +775,13 @@ pipeline {
...
@@ -775,9 +775,13 @@ pipeline {
defaultValue:
false
,
defaultValue:
false
,
description:
"Run the grouped conv large cases tests (default: OFF)"
)
description:
"Run the grouped conv large cases tests (default: OFF)"
)
booleanParam
(
booleanParam
(
name:
"RUN_CK_TILE_TESTS"
,
name:
"RUN_CK_TILE_
FMHA_
TESTS"
,
defaultValue:
false
,
defaultValue:
false
,
description:
"Run the ck_tile tests (default: OFF)"
)
description:
"Run the ck_tile FMHA tests (default: OFF)"
)
booleanParam
(
name:
"RUN_CK_TILE_GEMM_TESTS"
,
defaultValue:
false
,
description:
"Run the ck_tile GEMM tests (default: OFF)"
)
booleanParam
(
booleanParam
(
name:
"BUILD_INSTANCES_ONLY"
,
name:
"BUILD_INSTANCES_ONLY"
,
defaultValue:
false
,
defaultValue:
false
,
...
@@ -894,15 +898,15 @@ pipeline {
...
@@ -894,15 +898,15 @@ pipeline {
}
}
}
}
}
}
stage
(
"Run CK_TILE Tests"
)
stage
(
"Run CK_TILE
_FMHA
Tests"
)
{
{
parallel
parallel
{
{
stage
(
"Run CK_TILE Tests on gfx90a"
)
stage
(
"Run CK_TILE
_FMHA
Tests on gfx90a"
)
{
{
when
{
when
{
beforeAgent
true
beforeAgent
true
expression
{
params
.
RUN_CK_TILE_TESTS
.
toBoolean
()
}
expression
{
params
.
RUN_CK_TILE_
FMHA_
TESTS
.
toBoolean
()
}
}
}
agent
{
label
rocmnode
(
"gfx90a"
)
}
agent
{
label
rocmnode
(
"gfx90a"
)
}
environment
{
environment
{
...
@@ -917,11 +921,11 @@ pipeline {
...
@@ -917,11 +921,11 @@ pipeline {
cleanWs
()
cleanWs
()
}
}
}
}
stage
(
"Run CK_TILE Tests on gfx942"
)
stage
(
"Run CK_TILE
_FMHA
Tests on gfx942"
)
{
{
when
{
when
{
beforeAgent
true
beforeAgent
true
expression
{
params
.
RUN_CK_TILE_TESTS
.
toBoolean
()
}
expression
{
params
.
RUN_CK_TILE_
FMHA_
TESTS
.
toBoolean
()
}
}
}
agent
{
label
rocmnode
(
"gfx942"
)
}
agent
{
label
rocmnode
(
"gfx942"
)
}
environment
{
environment
{
...
@@ -937,6 +941,52 @@ pipeline {
...
@@ -937,6 +941,52 @@ pipeline {
}
}
}
}
}
}
}
stage
(
"Run CK_TILE_GEMM Tests"
)
{
parallel
{
stage
(
"Run CK_TILE_GEMM Tests on gfx90a"
)
{
when
{
beforeAgent
true
expression
{
params
.
RUN_CK_TILE_GEMM_TESTS
.
toBoolean
()
}
}
agent
{
label
rocmnode
(
"gfx90a"
)
}
environment
{
setup_args
=
"NO_CK_BUILD"
execute_args
=
""" ../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 tile_example_gemm_basic && \
cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
steps
{
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
cleanWs
()
}
}
stage
(
"Run CK_TILE_GEMM Tests on gfx942"
)
{
when
{
beforeAgent
true
expression
{
params
.
RUN_CK_TILE_GEMM_TESTS
.
toBoolean
()
}
}
agent
{
label
rocmnode
(
"gfx942"
)
}
environment
{
setup_args
=
"NO_CK_BUILD"
execute_args
=
""" ../script/cmake-ck-dev.sh ../ gfx942 && \
make -j64 tile_example_gemm_basic && \
cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
steps
{
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
cleanWs
()
}
}
}
}
}
stage
(
"Build CK and run Tests"
)
stage
(
"Build CK and run Tests"
)
{
{
...
...
codegen/test/CMakeLists.txt
View file @
6975cb8f
...
@@ -6,6 +6,9 @@ if(NOT INSTANCES_ONLY)
...
@@ -6,6 +6,9 @@ if(NOT INSTANCES_ONLY)
set_source_files_properties
(
${
TEST_SRC
}
PROPERTIES LANGUAGE HIP
)
set_source_files_properties
(
${
TEST_SRC
}
PROPERTIES LANGUAGE HIP
)
get_filename_component
(
BASE_NAME
${
TEST_SRC
}
NAME_WE
)
get_filename_component
(
BASE_NAME
${
TEST_SRC
}
NAME_WE
)
add_executable
(
codegen_test_
${
BASE_NAME
}
${
TEST_SRC
}
)
add_executable
(
codegen_test_
${
BASE_NAME
}
${
TEST_SRC
}
)
if
(
CK_USE_ALTERNATIVE_PYTHON
)
target_link_options
(
codegen_test_
${
BASE_NAME
}
PRIVATE -lstdc++fs
)
endif
()
add_dependencies
(
codegen codegen_test_
${
BASE_NAME
}
)
add_dependencies
(
codegen codegen_test_
${
BASE_NAME
}
)
add_dependencies
(
tests codegen_test_
${
BASE_NAME
}
)
add_dependencies
(
tests codegen_test_
${
BASE_NAME
}
)
add_dependencies
(
check codegen_test_
${
BASE_NAME
}
)
add_dependencies
(
check codegen_test_
${
BASE_NAME
}
)
...
...
codegen/test/rtc/include/rtc/compile_kernel.hpp
View file @
6975cb8f
...
@@ -2,14 +2,14 @@
...
@@ -2,14 +2,14 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#include <rtc/kernel.hpp>
#include <rtc/kernel.hpp>
#include <filesystem>
#include <
ck/
filesystem
.hpp
>
#include <string>
#include <string>
namespace
rtc
{
namespace
rtc
{
struct
src_file
struct
src_file
{
{
std
::
filesystem
::
path
path
;
CK
::
fs
::
path
path
;
std
::
string_view
content
;
std
::
string_view
content
;
};
};
...
...
codegen/test/rtc/include/rtc/tmp_dir.hpp
View file @
6975cb8f
...
@@ -2,13 +2,13 @@
...
@@ -2,13 +2,13 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#include <string>
#include <string>
#include <filesystem>
#include <
ck/
filesystem
.hpp
>
namespace
rtc
{
namespace
rtc
{
struct
tmp_dir
struct
tmp_dir
{
{
std
::
filesystem
::
path
path
;
CK
::
fs
::
path
path
;
tmp_dir
(
const
std
::
string
&
prefix
=
""
);
tmp_dir
(
const
std
::
string
&
prefix
=
""
);
void
execute
(
const
std
::
string
&
cmd
)
const
;
void
execute
(
const
std
::
string
&
cmd
)
const
;
...
...
codegen/test/rtc/src/compile_kernel.cpp
View file @
6975cb8f
...
@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
...
@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
for
(
const
auto
&
src
:
srcs
)
for
(
const
auto
&
src
:
srcs
)
{
{
std
::
filesystem
::
path
full_path
=
td
.
path
/
src
.
path
;
CK
::
fs
::
path
full_path
=
td
.
path
/
src
.
path
;
std
::
filesystem
::
path
parent_path
=
full_path
.
parent_path
();
CK
::
fs
::
path
parent_path
=
full_path
.
parent_path
();
std
::
filesystem
::
create_directories
(
parent_path
);
CK
::
fs
::
create_directories
(
parent_path
);
write_string
(
full_path
.
string
(),
src
.
content
);
write_string
(
full_path
.
string
(),
src
.
content
);
if
(
src
.
path
.
extension
().
string
()
==
".cpp"
)
if
(
src
.
path
.
extension
().
string
()
==
".cpp"
)
{
{
...
@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
...
@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
td
.
execute
(
compiler
()
+
options
.
flags
);
td
.
execute
(
compiler
()
+
options
.
flags
);
auto
out_path
=
td
.
path
/
out
;
auto
out_path
=
td
.
path
/
out
;
if
(
not
std
::
filesystem
::
exists
(
out_path
))
if
(
not
CK
::
fs
::
exists
(
out_path
))
throw
std
::
runtime_error
(
"Output file missing: "
+
out
);
throw
std
::
runtime_error
(
"Output file missing: "
+
out
);
auto
obj
=
read_buffer
(
out_path
.
string
());
auto
obj
=
read_buffer
(
out_path
.
string
());
...
...
codegen/test/rtc/src/tmp_dir.cpp
View file @
6975cb8f
...
@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix)
...
@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix)
}
}
tmp_dir
::
tmp_dir
(
const
std
::
string
&
prefix
)
tmp_dir
::
tmp_dir
(
const
std
::
string
&
prefix
)
:
path
(
std
::
filesystem
::
temp_directory_path
()
/
:
path
(
CK
::
fs
::
temp_directory_path
()
/
unique_string
(
prefix
.
empty
()
?
"ck-rtc"
:
"ck-rtc-"
+
prefix
))
unique_string
(
prefix
.
empty
()
?
"ck-rtc"
:
"ck-rtc-"
+
prefix
))
{
{
std
::
filesystem
::
create_directories
(
this
->
path
);
CK
::
fs
::
create_directories
(
this
->
path
);
}
}
void
tmp_dir
::
execute
(
const
std
::
string
&
cmd
)
const
void
tmp_dir
::
execute
(
const
std
::
string
&
cmd
)
const
...
@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const
...
@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const
std
::
system
(
s
.
c_str
());
std
::
system
(
s
.
c_str
());
}
}
tmp_dir
::~
tmp_dir
()
{
std
::
filesystem
::
remove_all
(
this
->
path
);
}
tmp_dir
::~
tmp_dir
()
{
CK
::
fs
::
remove_all
(
this
->
path
);
}
}
// namespace rtc
}
// namespace rtc
docs/sphinx/requirements.in
View file @
6975cb8f
rocm-docs-core==1.7.2
rocm-docs-core==1.7.2
sphinxcontrib-bibtex==2.6.
2
sphinxcontrib-bibtex==2.6.
3
docs/sphinx/requirements.txt
View file @
6975cb8f
...
@@ -137,7 +137,7 @@ sphinx-notfound-page==1.0.3
...
@@ -137,7 +137,7 @@ sphinx-notfound-page==1.0.3
# via rocm-docs-core
# via rocm-docs-core
sphinxcontrib-applehelp==2.0.0
sphinxcontrib-applehelp==2.0.0
# via sphinx
# via sphinx
sphinxcontrib-bibtex==2.6.
2
sphinxcontrib-bibtex==2.6.
3
# via -r requirements.in
# via -r requirements.in
sphinxcontrib-devhelp==2.0.0
sphinxcontrib-devhelp==2.0.0
# via sphinx
# via sphinx
...
...
example/ck_tile/01_fmha/CMakeLists.txt
View file @
6975cb8f
...
@@ -22,12 +22,20 @@ string(REPLACE ";" "," FMHA_FWD_APIS "${FMHA_FWD_ENABLE_APIS}")
...
@@ -22,12 +22,20 @@ string(REPLACE ";" "," FMHA_FWD_APIS "${FMHA_FWD_ENABLE_APIS}")
execute_process
(
execute_process
(
COMMAND
${
Python3_EXECUTABLE
}
${
CMAKE_CURRENT_LIST_DIR
}
/generate.py
COMMAND
${
Python3_EXECUTABLE
}
${
CMAKE_CURRENT_LIST_DIR
}
/generate.py
--api
${
FMHA_FWD_APIS
}
--list_blobs
${
CMAKE_CURRENT_BINARY_DIR
}
/fwd_blob_list.txt
--api
${
FMHA_FWD_APIS
}
--list_blobs
${
CMAKE_CURRENT_BINARY_DIR
}
/fwd_blob_list.txt
RESULT_VARIABLE ret
)
)
if
(
ret AND NOT ret EQUAL 0
)
message
(
FATAL_ERROR
"CK Tile FMHA FAILED to genrate a list of FWD kernels via Python."
)
endif
()
execute_process
(
execute_process
(
COMMAND
${
Python3_EXECUTABLE
}
${
CMAKE_CURRENT_LIST_DIR
}
/generate.py
COMMAND
${
Python3_EXECUTABLE
}
${
CMAKE_CURRENT_LIST_DIR
}
/generate.py
--api bwd --list_blobs
${
CMAKE_CURRENT_BINARY_DIR
}
/bwd_blob_list.txt --receipt 3
--api bwd --list_blobs
${
CMAKE_CURRENT_BINARY_DIR
}
/bwd_blob_list.txt --receipt 3
RESULT_VARIABLE ret
)
)
if
(
ret AND NOT ret EQUAL 0
)
message
(
FATAL_ERROR
"CK Tile FMHA FAILED to genrate a list of BWD kernels via Python."
)
endif
()
# NOTE: for cmake, the FMHA_FWD_GEN_BLOBS/FMHA_BWD_GEN_BLOBS files must be in the same directory
# NOTE: for cmake, the FMHA_FWD_GEN_BLOBS/FMHA_BWD_GEN_BLOBS files must be in the same directory
# as current cmake list, otherwise will not figure out the dependency properly
# as current cmake list, otherwise will not figure out the dependency properly
...
...
example/ck_tile/03_gemm/gemm_basic.cpp
View file @
6975cb8f
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "gemm_basic.hpp"
#include "gemm_basic.hpp"
#include
"ck_tile/host.hpp"
#include
<hip/hip_runtime.h>
#include <cstring>
#include <cstring>
#include <iostream>
#include <iostream>
...
@@ -21,7 +21,7 @@ auto create_args(int argc, char* argv[])
...
@@ -21,7 +21,7 @@ auto create_args(int argc, char* argv[])
.
insert
(
"stride_a"
,
"0"
,
"Tensor A stride"
)
.
insert
(
"stride_a"
,
"0"
,
"Tensor A stride"
)
.
insert
(
"stride_b"
,
"0"
,
"Tensor B stride"
)
.
insert
(
"stride_b"
,
"0"
,
"Tensor B stride"
)
.
insert
(
"stride_c"
,
"0"
,
"Tensor C stride"
)
.
insert
(
"stride_c"
,
"0"
,
"Tensor C stride"
)
.
insert
(
"v"
,
"
1
"
,
"
cpu
validation
or not
"
)
.
insert
(
"v"
,
"
2
"
,
"
0. No
validation
, 1. Validation on CPU, 2. Validation on GPU
"
)
.
insert
(
"e"
,
"1e-5"
,
"Absolute error tolerance"
)
.
insert
(
"e"
,
"1e-5"
,
"Absolute error tolerance"
)
.
insert
(
"prec"
,
"fp16"
,
"data type. fp16/bf16/fp8/bf8"
)
.
insert
(
"prec"
,
"fp16"
,
"data type. fp16/bf16/fp8/bf8"
)
.
insert
(
"warmup"
,
"10"
,
"number of iterations before benchmark the kernel"
)
.
insert
(
"warmup"
,
"10"
,
"number of iterations before benchmark the kernel"
)
...
@@ -32,40 +32,21 @@ auto create_args(int argc, char* argv[])
...
@@ -32,40 +32,21 @@ auto create_args(int argc, char* argv[])
return
std
::
make_tuple
(
result
,
arg_parser
);
return
std
::
make_tuple
(
result
,
arg_parser
);
}
}
template
<
typename
LayoutA
,
typename
LayoutB
,
typename
LayoutC
>
template
<
typename
LayoutA
,
typename
LayoutB
,
typename
LayoutC
,
typename
PipelineProblem
,
typename
GemmPipeline
,
typename
GemmShape
>
float
gemm_calc
(
const
gemm_basic_args
&
args
,
const
ck_tile
::
stream_config
&
s
)
float
gemm_calc
(
const
gemm_basic_args
&
args
,
const
ck_tile
::
stream_config
&
s
)
{
{
// ToDo: This will be modified by the codegen code later.
constexpr
ck_tile
::
index_t
M_Tile
=
128
;
constexpr
ck_tile
::
index_t
N_Tile
=
128
;
constexpr
ck_tile
::
index_t
K_Tile
=
32
;
constexpr
ck_tile
::
index_t
M_Warp
=
2
;
constexpr
ck_tile
::
index_t
N_Warp
=
2
;
constexpr
ck_tile
::
index_t
K_Warp
=
1
;
constexpr
ck_tile
::
index_t
M_Warp_Tile
=
32
;
constexpr
ck_tile
::
index_t
N_Warp_Tile
=
32
;
constexpr
ck_tile
::
index_t
K_Warp_Tile
=
8
;
// The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
// The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
constexpr
bool
kPadA
=
true
;
constexpr
bool
kPadA
=
true
;
constexpr
bool
kPadB
=
true
;
constexpr
bool
kPadB
=
true
;
constexpr
bool
kPadC
=
false
;
constexpr
int
kBlockPerCu
=
1
;
constexpr
int
kBlockPerCu
=
1
;
// ===============================================
using
GemmShape
=
ck_tile
::
TileGemmShape
<
ck_tile
::
sequence
<
M_Tile
,
N_Tile
,
K_Tile
>
,
ck_tile
::
sequence
<
M_Warp
,
N_Warp
,
K_Warp
>
,
ck_tile
::
sequence
<
M_Warp_Tile
,
N_Warp_Tile
,
K_Warp_Tile
>>
;
using
TilePartitioner
=
ck_tile
::
GemmTilePartitioner
<
GemmShape
>
;
using
TilePartitioner
=
ck_tile
::
GemmTilePartitioner
<
GemmShape
>
;
using
PipelineProblem
=
ck_tile
::
BlockGemmPipelineProblem
<
ADataType
,
BDataType
,
AccDataType
,
GemmShape
,
kPadA
,
kPadB
,
kPadC
>
;
// The GemmPipeline should also come from the Codegen.
using
GemmPipeline
=
ck_tile
::
BlockGemmPipelineAGmemBGmemCRegV1
<
PipelineProblem
>
;
using
GemmEpilogue
=
ck_tile
::
Default2DEpilogue
<
using
GemmEpilogue
=
ck_tile
::
Default2DEpilogue
<
ck_tile
::
Default2DEpilogueProblem
<
AccDataType
,
CDataType
,
kPadA
,
kPadB
>>
;
ck_tile
::
Default2DEpilogueProblem
<
AccDataType
,
CDataType
,
kPadA
,
kPadB
>>
;
// ToDo: Will add the codegen part to test different pipeline policies in GEMM.
// ToDo: Will add the codegen part to test different pipeline policies in GEMM.
...
@@ -93,7 +74,13 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
...
@@ -93,7 +74,13 @@ float gemm_calc(const gemm_basic_args& args, const ck_tile::stream_config& s)
return
ave_time
;
return
ave_time
;
}
}
template
<
typename
DataType
,
typename
LayoutA
,
typename
LayoutB
,
typename
LayoutC
>
template
<
typename
DataType
,
typename
LayoutA
,
typename
LayoutB
,
typename
LayoutC
,
typename
PipelineProblem
,
typename
GemmPipeline
,
typename
GemmShape
>
float
invoke_gemm
(
ck_tile
::
DeviceMem
&
a_buf
,
float
invoke_gemm
(
ck_tile
::
DeviceMem
&
a_buf
,
ck_tile
::
DeviceMem
&
b_buf
,
ck_tile
::
DeviceMem
&
b_buf
,
ck_tile
::
DeviceMem
&
c_buf
,
ck_tile
::
DeviceMem
&
c_buf
,
...
@@ -155,7 +142,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf,
...
@@ -155,7 +142,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf,
else
else
{
{
args
.
stride_B
=
[
&
]()
{
args
.
stride_B
=
[
&
]()
{
if
constexpr
(
std
::
is_same_v
<
LayoutB
,
ck_tile
::
tensor_layout
::
gemm
::
Column
Major
>
)
if
constexpr
(
std
::
is_same_v
<
LayoutB
,
ck_tile
::
tensor_layout
::
gemm
::
Row
Major
>
)
{
{
return
N
;
return
N
;
}
}
...
@@ -184,8 +171,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf,
...
@@ -184,8 +171,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_buf,
}();
}();
}
}
float
ave_time
=
float
ave_time
=
gemm_calc
<
LayoutA
,
LayoutB
,
LayoutC
,
PipelineProblem
,
GemmPipeline
,
GemmShape
>
(
gemm_calc
<
LayoutA
,
LayoutB
,
LayoutC
>
(
args
,
ck_tile
::
stream_config
{
nullptr
,
true
});
args
,
ck_tile
::
stream_config
{
nullptr
,
true
});
std
::
size_t
num_byte
=
std
::
size_t
num_byte
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
N
*
K
+
sizeof
(
CDataType
)
*
M
*
N
;
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
N
*
K
+
sizeof
(
CDataType
)
*
M
*
N
;
float
gb_per_sec
=
num_byte
/
1.E6
/
ave_time
;
float
gb_per_sec
=
num_byte
/
1.E6
/
ave_time
;
...
@@ -212,7 +199,7 @@ int main(int argc, char* argv[])
...
@@ -212,7 +199,7 @@ int main(int argc, char* argv[])
// The Matrix Multiplication goes with Matrix A (M, K), Matrix B (N, K) = Matrix C (M, N).
// The Matrix Multiplication goes with Matrix A (M, K), Matrix B (N, K) = Matrix C (M, N).
using
matrix_a_layout
=
ck_tile
::
tensor_layout
::
gemm
::
RowMajor
;
using
matrix_a_layout
=
ck_tile
::
tensor_layout
::
gemm
::
RowMajor
;
using
matrix_b_layout
=
ck_tile
::
tensor_layout
::
gemm
::
Row
Major
;
using
matrix_b_layout
=
ck_tile
::
tensor_layout
::
gemm
::
Column
Major
;
using
matrix_c_layout
=
ck_tile
::
tensor_layout
::
gemm
::
RowMajor
;
using
matrix_c_layout
=
ck_tile
::
tensor_layout
::
gemm
::
RowMajor
;
// host verify
// host verify
...
@@ -221,7 +208,7 @@ int main(int argc, char* argv[])
...
@@ -221,7 +208,7 @@ int main(int argc, char* argv[])
?
std
::
vector
<
int
>
{
M
,
K
}
?
std
::
vector
<
int
>
{
M
,
K
}
:
std
::
vector
<
int
>
{
K
,
M
};
:
std
::
vector
<
int
>
{
K
,
M
};
std
::
vector
<
int
>
b_dimensions
=
std
::
vector
<
int
>
b_dimensions
=
(
std
::
is_same_v
<
matrix_b_layout
,
ck_tile
::
tensor_layout
::
gemm
::
Row
Major
>
)
(
std
::
is_same_v
<
matrix_b_layout
,
ck_tile
::
tensor_layout
::
gemm
::
Column
Major
>
)
?
std
::
vector
<
int
>
{
N
,
K
}
?
std
::
vector
<
int
>
{
N
,
K
}
:
std
::
vector
<
int
>
{
K
,
N
};
:
std
::
vector
<
int
>
{
K
,
N
};
std
::
vector
<
int
>
c_dimensions
=
std
::
vector
<
int
>
c_dimensions
=
...
@@ -245,12 +232,52 @@ int main(int argc, char* argv[])
...
@@ -245,12 +232,52 @@ int main(int argc, char* argv[])
a_buf
.
ToDevice
(
a_host
.
data
());
a_buf
.
ToDevice
(
a_host
.
data
());
b_buf
.
ToDevice
(
b_host
.
data
());
b_buf
.
ToDevice
(
b_host
.
data
());
invoke_gemm
<
ck_tile
::
half_t
,
matrix_a_layout
,
matrix_b_layout
,
matrix_c_layout
>
(
// The kPadA, kPadB, kPadC & kBlockPerCu should also come from the Codegen part.
a_buf
,
b_buf
,
c_buf
,
arg_parser
);
constexpr
bool
kPadA
=
true
;
constexpr
bool
kPadB
=
true
;
constexpr
bool
kPadC
=
false
;
// This part comes from the Codegen
constexpr
ck_tile
::
index_t
M_Tile
=
128
;
constexpr
ck_tile
::
index_t
N_Tile
=
128
;
constexpr
ck_tile
::
index_t
K_Tile
=
32
;
constexpr
ck_tile
::
index_t
M_Warp
=
2
;
constexpr
ck_tile
::
index_t
N_Warp
=
2
;
constexpr
ck_tile
::
index_t
K_Warp
=
1
;
constexpr
ck_tile
::
index_t
M_Warp_Tile
=
32
;
constexpr
ck_tile
::
index_t
N_Warp_Tile
=
32
;
constexpr
ck_tile
::
index_t
K_Warp_Tile
=
8
;
using
CodegenGemmShape
=
ck_tile
::
TileGemmShape
<
ck_tile
::
sequence
<
M_Tile
,
N_Tile
,
K_Tile
>
,
ck_tile
::
sequence
<
M_Warp
,
N_Warp
,
K_Warp
>
,
ck_tile
::
sequence
<
M_Warp_Tile
,
N_Warp_Tile
,
K_Warp_Tile
>>
;
using
CodegenPipelineProblem
=
ck_tile
::
BlockGemmPipelineProblem
<
ADataType
,
BDataType
,
AccDataType
,
CodegenGemmShape
,
kPadA
,
kPadB
,
kPadC
>
;
using
CodegenGemmPipeline
=
ck_tile
::
BlockGemmPipelineAGmemBGmemCRegV1
<
CodegenPipelineProblem
>
;
invoke_gemm
<
ck_tile
::
half_t
,
matrix_a_layout
,
matrix_b_layout
,
matrix_c_layout
,
CodegenPipelineProblem
,
CodegenGemmPipeline
,
CodegenGemmShape
>
(
a_buf
,
b_buf
,
c_buf
,
arg_parser
);
c_buf
.
FromDevice
(
c_host_dev
.
data
());
bool
pass
=
true
;
bool
pass
_cpu
=
true
;
if
(
arg_parser
.
get_
bool
(
"v"
))
if
(
arg_parser
.
get_
int
(
"v"
)
==
1
)
{
{
// ToDo: Will Add the Element Op (bias) verification in the future.
// ToDo: Will Add the Element Op (bias) verification in the future.
ck_tile
::
reference_gemm
<
ADataType
,
ck_tile
::
reference_gemm
<
ADataType
,
...
@@ -261,14 +288,71 @@ int main(int argc, char* argv[])
...
@@ -261,14 +288,71 @@ int main(int argc, char* argv[])
matrix_b_layout
,
matrix_b_layout
,
matrix_c_layout
>
(
a_host
,
b_host
,
c_host_ref
);
matrix_c_layout
>
(
a_host
,
b_host
,
c_host_ref
);
c_buf
.
FromDevice
(
c_host_dev
.
data
());
pass_cpu
=
ck_tile
::
check_err
(
c_host_dev
,
c_host_ref
);
std
::
cout
<<
"The CPU veification result is:"
<<
(
pass_cpu
?
"correct"
:
"fail"
)
<<
std
::
flush
;
}
bool
pass_gpu
=
true
;
if
(
arg_parser
.
get_int
(
"v"
)
==
2
)
{
ck_tile
::
index_t
stride_a
=
arg_parser
.
get_int
(
"stride_a"
);
ck_tile
::
index_t
stride_b
=
arg_parser
.
get_int
(
"stride_b"
);
ck_tile
::
index_t
stride_c
=
arg_parser
.
get_int
(
"stride_c"
);
if
(
stride_a
==
0
)
{
if
constexpr
(
std
::
is_same_v
<
matrix_a_layout
,
ck_tile
::
tensor_layout
::
gemm
::
ColumnMajor
>
)
{
stride_a
=
M
;
}
else
{
stride_a
=
K
;
}
}
pass
=
ck_tile
::
check_err
(
c_host_dev
,
c_host_ref
);
if
(
stride_b
==
0
)
{
if
constexpr
(
std
::
is_same_v
<
matrix_b_layout
,
ck_tile
::
tensor_layout
::
gemm
::
RowMajor
>
)
{
stride_b
=
N
;
}
else
{
stride_b
=
K
;
}
}
std
::
cout
<<
"The veification result is:"
<<
(
pass
?
"correct"
:
"fail"
)
<<
std
::
flush
;
if
(
stride_c
==
0
)
{
if
constexpr
(
std
::
is_same_v
<
matrix_c_layout
,
ck_tile
::
tensor_layout
::
gemm
::
ColumnMajor
>
)
{
stride_c
=
M
;
}
else
{
stride_c
=
N
;
}
}
ck_tile
::
HostTensor
<
CDataType
>
c_host_gpu_ref
(
c_dimensions
);
ck_tile
::
DeviceMem
c_gpu_buf
(
c_host_gpu_ref
.
get_element_space_size_in_bytes
());
ck_tile
::
reference_gemm_gpu
<
ADataType
,
BDataType
,
AccDataType
,
CDataType
>
(
a_buf
,
b_buf
,
c_gpu_buf
,
M
,
N
,
K
,
stride_a
,
stride_b
,
stride_c
);
c_buf
.
FromDevice
(
c_host_gpu_ref
.
data
());
pass_gpu
=
ck_tile
::
check_err
(
c_host_dev
,
c_host_gpu_ref
);
std
::
cout
<<
"The GPU veification result is:"
<<
(
pass_gpu
?
"correct"
:
"fail"
)
<<
std
::
flush
;
}
}
std
::
cout
<<
std
::
endl
<<
std
::
flush
;
std
::
cout
<<
std
::
endl
<<
std
::
flush
;
return
!
pass
;
return
!
pass
_gpu
;
}
}
example/ck_tile/03_gemm/gemm_basic.hpp
View file @
6975cb8f
...
@@ -8,6 +8,7 @@
...
@@ -8,6 +8,7 @@
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/epilogue.hpp"
#include "ck_tile/ops/epilogue.hpp"
#include "ck_tile/ops/gemm.hpp"
#include "ck_tile/ops/gemm.hpp"
#include "ck_tile/host.hpp"
#include <string>
#include <string>
template
<
typename
DataType
>
template
<
typename
DataType
>
...
...
example/ck_tile/03_gemm/script/run_full_test.sh
0 → 100755
View file @
6975cb8f
#!/bin/bash
#
# in order to run this script you'd first need to build the tile_example_gemm executables in ../build/bin/
#
# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>
# input arguments:
# environment tag : a string describing the specifics of your test environment
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
# host name : $hostname
# gpu architecture: e.g., gfx90a, or gfx942, etc.
# get the command line arguments:
export
env_type
=
$1
echo
'Environment type: '
$env_type
export
branch
=
$2
echo
'Branch name: '
$branch
export
host_name
=
$3
echo
'Host name: '
$host_name
export
GPU_arch
=
$4
echo
'GPU_arch: '
$GPU_arch
# run verification tests
example/ck_tile/03_gemm/script/smoke_test.sh
# We do not have a performance benchmark for gemm yet. Will add it in the future.
\ No newline at end of file
example/ck_tile/03_gemm/script/smoke_test.sh
0 → 100755
View file @
6975cb8f
#!/bin/bash
EXE
=
"
$(
find
.
-name
tile_example_gemm_basic
-type
f |
head
-n
1
)
"
KNAME
=
1
export
CK_WARMUP
=
0
export
CK_REPEAT
=
1
COMMON_ARGS
=
'-v=2 -warmup=0 -repeat=1'
run_fp16_tests
()
{
for
batch
in
1 2
;
do
for
m
in
128 1024
;
do
for
n
in
128 2048
;
do
for
k
in
32 64
;
do
$EXE
-b
=
$batch
-m
=
$m
-n
=
$n
-k
=
$k
-stride_a
=
0
-stride_b
=
0
-stride_c
=
0
-e
=
1e-5
-prec
=
fp16
$COMMON_ARGS
if
[
$?
-eq
0
]
;
then
echo
"Success: Test with batch=
$batch
, m=
$m
, n=
$n
, k=
$k
executed successfully."
else
echo
"Error: Test with batch=
$batch
, m=
$m
, n=
$n
, k=
$k
failed to execute properly."
# Optionally, exit or break if you need to halt further execution
# exit 1
fi
done
done
done
done
}
set
-x
run_fp16_tests
set
+x
\ No newline at end of file
include/ck/filesystem.hpp
0 → 100644
View file @
6975cb8f
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#ifndef GUARD_CK_FILESYSTEM_HPP_
#define GUARD_CK_FILESYSTEM_HPP_
#include <string>
#include <string_view>
// clang-format off
#if defined(CPPCHECK)
#define CK_HAS_FILESYSTEM 1
#define CK_HAS_FILESYSTEM_TS 1
#elif defined(_WIN32)
#if _MSC_VER >= 1920
#define CK_HAS_FILESYSTEM 1
#define CK_HAS_FILESYSTEM_TS 0
#elif _MSC_VER >= 1900
#define CK_HAS_FILESYSTEM 0
#define CK_HAS_FILESYSTEM_TS 1
#else
#define CK_HAS_FILESYSTEM 0
#define CK_HAS_FILESYSTEM_TS 0
#endif
#elif defined(__has_include)
#if __has_include(<filesystem>) && __cplusplus >= 201703L
#define CK_HAS_FILESYSTEM 1
#else
#define CK_HAS_FILESYSTEM 0
#endif
#if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L
#define CK_HAS_FILESYSTEM_TS 1
#else
#define CK_HAS_FILESYSTEM_TS 0
#endif
#else
#define CK_HAS_FILESYSTEM 0
#define CK_HAS_FILESYSTEM_TS 0
#endif
// clang-format on
#if CK_HAS_FILESYSTEM
#include <filesystem>
#elif CK_HAS_FILESYSTEM_TS
#include <experimental/filesystem>
#else
#error "No filesystem include available"
#endif
namespace
CK
{
#if CK_HAS_FILESYSTEM
namespace
fs
=
::
std
::
filesystem
;
#elif CK_HAS_FILESYSTEM_TS
namespace
fs
=
::
std
::
experimental
::
filesystem
;
#endif
}
// namespace CK
inline
std
::
string
operator
+
(
const
std
::
string_view
s
,
const
CK
::
fs
::
path
&
path
)
{
return
path
.
string
().
insert
(
0
,
s
);
}
inline
std
::
string
operator
+
(
const
CK
::
fs
::
path
&
path
,
const
std
::
string_view
s
)
{
return
path
.
string
().
append
(
s
);
}
#define FS_ENUM_PERMS_ALL fs::perms::all
#if CK_HAS_FILESYSTEM_TS
#ifdef __linux__
#include <linux/limits.h>
namespace
CK
{
inline
fs
::
path
weakly_canonical
(
const
fs
::
path
&
path
)
{
std
::
string
result
(
PATH_MAX
,
'\0'
);
std
::
string
p
{
path
.
is_relative
()
?
(
fs
::
current_path
()
/
path
).
string
()
:
path
.
string
()};
char
*
retval
=
realpath
(
p
.
c_str
(),
&
result
[
0
]);
return
(
retval
==
nullptr
)
?
path
:
fs
::
path
{
result
};
}
}
// namespace CK
#else
#error "Not implmeneted!"
#endif
#else
namespace
CK
{
inline
fs
::
path
weakly_canonical
(
const
fs
::
path
&
path
)
{
return
fs
::
weakly_canonical
(
path
);
}
}
// namespace CK
#endif
namespace
CK
{
#ifdef _WIN32
// Platform-specific artifact-name components (Windows conventions).
constexpr std::string_view executable_postfix{".exe"};
constexpr std::string_view library_prefix{""}; // no "lib" prefix on Windows
constexpr std::string_view dynamic_library_postfix{".dll"};
constexpr std::string_view static_library_postfix{".lib"};
constexpr std::string_view object_file_postfix{".obj"};
#else
// Platform-specific artifact-name components (POSIX conventions).
constexpr std::string_view executable_postfix{""}; // executables carry no postfix
constexpr std::string_view library_prefix{"lib"};
constexpr std::string_view dynamic_library_postfix{".so"};
constexpr std::string_view static_library_postfix{".a"};
constexpr std::string_view object_file_postfix{".o"};
#endif
/// Build an executable file name from `path` by appending the platform
/// executable postfix to its filename component.
inline fs::path make_executable_name(const fs::path& path)
{
    const auto file_name = path.filename() + executable_postfix;
    return path.parent_path() / file_name;
}
/// Build a dynamic-library file name from `path`: platform library prefix +
/// filename + dynamic-library postfix, in the same directory.
inline fs::path make_dynamic_library_name(const fs::path& path)
{
    std::string file_name{library_prefix};
    file_name += path.filename().string();
    file_name += dynamic_library_postfix;
    return path.parent_path() / file_name;
}
/// Build an object-file name from `path` by appending the platform
/// object-file postfix to its filename component.
inline fs::path make_object_file_name(const fs::path& path)
{
    std::string file_name = path.filename().string();
    file_name += object_file_postfix;
    return path.parent_path() / file_name;
}
/// Build a static-library file name from `path`: platform library prefix +
/// filename + static-library postfix, in the same directory.
inline fs::path make_static_library_name(const fs::path& path)
{
    std::string file_name{library_prefix};
    file_name += path.filename().string();
    file_name += static_library_postfix;
    return path.parent_path() / file_name;
}
struct
FsPathHash
{
std
::
size_t
operator
()(
const
fs
::
path
&
path
)
const
{
return
fs
::
hash_value
(
path
);
}
};
}
// namespace CK
#endif // GUARD_CK_FILESYSTEM_HPP_
include/ck_tile/host/reference/reference_gemm.hpp
View file @
6975cb8f
...
@@ -5,7 +5,7 @@
...
@@ -5,7 +5,7 @@
#include "ck_tile/core.hpp"
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include "ck_tile/host/host_tensor.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/common
/tensor_layout
.hpp"
#include <thread>
#include <thread>
namespace
ck_tile
{
namespace
ck_tile
{
...
@@ -57,4 +57,121 @@ CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
...
@@ -57,4 +57,121 @@ CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
make_ParallelTensorFunctor
(
f
,
M
)(
std
::
thread
::
hardware_concurrency
());
make_ParallelTensorFunctor
(
f
,
M
)(
std
::
thread
::
hardware_concurrency
());
}
}
/// Naive reference GEMM device kernel: one thread computes one element of C.
/// Indexing shows A is read as A[row * strideA + k] and B as
/// B[col * strideB + k] (K contiguous along both operands), and C is written
/// as C[row * strideC + col].
/// @tparam AccDataType  accumulation type; products are accumulated in this
///                      type before conversion to CDataType.
template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
__global__ void naive_gemm_kernel(ADataType* A,
                                  BDataType* B,
                                  CDataType* C,
                                  ck_tile::index_t M,
                                  ck_tile::index_t N,
                                  ck_tile::index_t K,
                                  ck_tile::index_t strideA,
                                  ck_tile::index_t strideB,
                                  ck_tile::index_t strideC)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = idx / N; // row index of the C element owned by this thread
    const int col = idx % N; // column index

    if(row < M && col < N)
    {
        // Initialize with integer 0 so integral AccDataType works as well as
        // floating-point (0.0 would force a float-to-int conversion).
        AccDataType acc = 0;
        for(int k = 0; k < K; ++k)
        {
            acc += static_cast<AccDataType>(A[row * strideA + k]) *
                   static_cast<AccDataType>(B[col * strideB + k]);
        }
        // Convert the accumulator explicitly to the output element type
        // (the previous implicit conversion carried a misleading comment
        // claiming the value was stored as AccDataType).
        C[row * strideC + col] = static_cast<CDataType>(acc);
    }
}
/// Run the naive reference GEMM on the GPU.
/// Allocates scratch device buffers, copies A and B in, launches
/// naive_gemm_kernel with one thread per C element, copies C back into
/// c_device, and frees the scratch buffers.
/// NOTE(review): the source buffers come from DeviceMem::GetDeviceBuffer()
/// but are copied with hipMemcpyHostToDevice — confirm GetDeviceBuffer()
/// returns host-accessible memory in this configuration.
/// @param M,N,K                   GEMM problem sizes
/// @param stride_a,stride_b,stride_c  leading dimensions forwarded to the kernel
template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
void reference_gemm_gpu(DeviceMem& a_device,
                        DeviceMem& b_device,
                        DeviceMem& c_device,
                        index_t M,
                        index_t N,
                        index_t K,
                        index_t stride_a,
                        index_t stride_b,
                        index_t stride_c)
{
    ADataType* d_A = nullptr;
    BDataType* d_B = nullptr;
    CDataType* d_C = nullptr;

    // Free one buffer, reporting (but not aborting on) failure; nullptr-safe.
    auto checked_free = [](void* ptr, const char* name) {
        if(ptr == nullptr)
        {
            return;
        }
        const hipError_t err = hipFree(ptr);
        if(err != hipSuccess)
        {
            std::cerr << "Error free the " << name
                      << " memory: " << hipGetErrorString(err) << std::endl;
        }
    };

    hipError_t errA = hipMalloc(&d_A, M * K * sizeof(ADataType));
    if(errA != hipSuccess)
    {
        std::cerr << "Error allocating device memory for A: " << hipGetErrorString(errA)
                  << std::endl;
        return; // nothing allocated yet
    }
    hipError_t errB = hipMalloc(&d_B, N * K * sizeof(BDataType));
    if(errB != hipSuccess)
    {
        std::cerr << "Error allocating device memory for B: " << hipGetErrorString(errB)
                  << std::endl;
        checked_free(d_A, "A"); // do not leak A on the early exit
        return;
    }
    hipError_t errC = hipMalloc(&d_C, M * N * sizeof(CDataType));
    if(errC != hipSuccess)
    {
        std::cerr << "Error allocating device memory for C: " << hipGetErrorString(errC)
                  << std::endl;
        checked_free(d_A, "A"); // do not leak A/B on the early exit
        checked_free(d_B, "B");
        return;
    }

    errA = hipMemcpy(
        d_A, a_device.GetDeviceBuffer(), M * K * sizeof(ADataType), hipMemcpyHostToDevice);
    if(errA != hipSuccess)
    {
        std::cerr << "Error copying A to device: " << hipGetErrorString(errA) << std::endl;
    }
    errB = hipMemcpy(
        d_B, b_device.GetDeviceBuffer(), N * K * sizeof(BDataType), hipMemcpyHostToDevice);
    if(errB != hipSuccess)
    {
        std::cerr << "Error copying B to device: " << hipGetErrorString(errB) << std::endl;
    }

    const int totalElements     = M * N;
    const int numThreadsPerBlock = 256; // common choice for threads per block
    const int numBlocks = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;

    naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType>
        <<<numBlocks, numThreadsPerBlock>>>(
            d_A, d_B, d_C, M, N, K, stride_a, stride_b, stride_c);

    // Kernel launches are asynchronous; surface launch failures explicitly
    // instead of letting them masquerade as a later memcpy error.
    const hipError_t errLaunch = hipGetLastError();
    if(errLaunch != hipSuccess)
    {
        std::cerr << "Error launching naive_gemm_kernel: " << hipGetErrorString(errLaunch)
                  << std::endl;
    }

    // hipMemcpy on the null stream synchronizes with the kernel before copying.
    errC = hipMemcpy(
        c_device.GetDeviceBuffer(), d_C, M * N * sizeof(CDataType), hipMemcpyDeviceToHost);
    if(errC != hipSuccess)
    {
        // (message previously said "to device" for this device-to-host copy)
        std::cerr << "Error copying C from device: " << hipGetErrorString(errC) << std::endl;
    }

    checked_free(d_A, "A");
    checked_free(d_B, "B");
    checked_free(d_C, "C");
}
}
// namespace ck_tile
}
// namespace ck_tile
include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
View file @
6975cb8f
...
@@ -76,8 +76,7 @@ struct GemmKernel
...
@@ -76,8 +76,7 @@ struct GemmKernel
CK_TILE_DEVICE
void
operator
()(
GemmCommonKargs
kargs
)
const
CK_TILE_DEVICE
void
operator
()(
GemmCommonKargs
kargs
)
const
{
{
const
index_t
i_m
=
TilePartitioner
::
iM
;
const
auto
[
i_m
,
i_n
]
=
TilePartitioner
{}();
const
index_t
i_n
=
TilePartitioner
::
iN
;
// options
// options
const
ADataType
*
a_start
=
static_cast
<
const
ADataType
*>
(
kargs
.
a_ptr
);
const
ADataType
*
a_start
=
static_cast
<
const
ADataType
*>
(
kargs
.
a_ptr
);
const
BDataType
*
b_start
=
static_cast
<
const
BDataType
*>
(
kargs
.
b_ptr
);
const
BDataType
*
b_start
=
static_cast
<
const
BDataType
*>
(
kargs
.
b_ptr
);
...
@@ -104,7 +103,7 @@ struct GemmKernel
...
@@ -104,7 +103,7 @@ struct GemmKernel
}();
}();
auto
b_tensor_view
=
[
&
]()
{
auto
b_tensor_view
=
[
&
]()
{
if
constexpr
(
std
::
is_same_v
<
LayoutB
,
tensor_layout
::
gemm
::
Column
Major
>
)
if
constexpr
(
std
::
is_same_v
<
LayoutB
,
tensor_layout
::
gemm
::
Row
Major
>
)
{
{
return
make_naive_tensor_view
<
address_space_enum
::
global
>
(
return
make_naive_tensor_view
<
address_space_enum
::
global
>
(
b_start
,
b_start
,
...
...
include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
View file @
6975cb8f
...
@@ -15,9 +15,6 @@ struct GemmTilePartitioner
...
@@ -15,9 +15,6 @@ struct GemmTilePartitioner
static
constexpr
ck_tile
::
index_t
kN
=
BlockGemmShape
::
kN
;
static
constexpr
ck_tile
::
index_t
kN
=
BlockGemmShape
::
kN
;
static
constexpr
ck_tile
::
index_t
kK
=
BlockGemmShape
::
kK
;
static
constexpr
ck_tile
::
index_t
kK
=
BlockGemmShape
::
kK
;
const
index_t
iM
=
__builtin_amdgcn_readfirstlane
(
i_tile_m
*
kM
);
const
index_t
iN
=
__builtin_amdgcn_readfirstlane
(
i_tile_n
*
kN
);
CK_TILE_HOST
static
constexpr
auto
CK_TILE_HOST
static
constexpr
auto
GridSize
(
ck_tile
::
index_t
M
,
ck_tile
::
index_t
N
,
ck_tile
::
index_t
batch_size
)
GridSize
(
ck_tile
::
index_t
M
,
ck_tile
::
index_t
N
,
ck_tile
::
index_t
batch_size
)
{
{
...
@@ -29,10 +26,9 @@ struct GemmTilePartitioner
...
@@ -29,10 +26,9 @@ struct GemmTilePartitioner
CK_TILE_DEVICE
auto
operator
()()
CK_TILE_DEVICE
auto
operator
()()
{
{
const
index_t
i_GridDimX
=
blockIdx
.
x
;
const
index_t
iM
=
__builtin_amdgcn_readfirstlane
(
blockIdx
.
x
*
kM
);
const
index_t
i_GridDimY
=
blockIdx
.
y
;
const
index_t
iN
=
__builtin_amdgcn_readfirstlane
(
blockIdx
.
y
*
kN
);
const
index_t
i_GridDimZ
=
blockIdx
.
z
;
return
ck_tile
::
make_tuple
(
iM
,
iN
);
return
ck_tile
::
make_tuple
(
i_GridDimX
,
i_GridDimY
,
i_GridDimZ
);
}
}
};
};
}
// namespace ck_tile
}
// namespace ck_tile
library/include/ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp
View file @
6975cb8f
...
@@ -23,6 +23,10 @@ void add_device_maxpool_bwd_bf16_instances(
...
@@ -23,6 +23,10 @@ void add_device_maxpool_bwd_bf16_instances(
void
add_device_maxpool_bwd_f32_instances
(
void
add_device_maxpool_bwd_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceMaxPoolBwd
<
F32
,
I32
,
F32
>>>&
);
std
::
vector
<
std
::
unique_ptr
<
DeviceMaxPoolBwd
<
F32
,
I32
,
F32
>>>&
);
#endif
#endif
#ifdef CK_ENABLE_FP8
void
add_device_maxpool_bwd_f8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceMaxPoolBwd
<
F8
,
I32
,
F8
>>>&
);
#endif
#ifdef CK_ENABLE_INT8
#ifdef CK_ENABLE_INT8
void
add_device_maxpool_bwd_int8_instances
(
void
add_device_maxpool_bwd_int8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceMaxPoolBwd
<
I8
,
I32
,
I8
>>>&
);
std
::
vector
<
std
::
unique_ptr
<
DeviceMaxPoolBwd
<
I8
,
I32
,
I8
>>>&
);
...
@@ -53,6 +57,11 @@ struct DeviceOperationInstanceFactory<
...
@@ -53,6 +57,11 @@ struct DeviceOperationInstanceFactory<
is_same_v
<
IndexDataType
,
I32
>
)
is_same_v
<
IndexDataType
,
I32
>
)
add_device_maxpool_bwd_f32_instances
(
op_ptrs
);
add_device_maxpool_bwd_f32_instances
(
op_ptrs
);
#endif
#endif
#ifdef CK_ENABLE_FP8
else
if
constexpr
(
is_same_v
<
DOutDataType
,
F8
>
&&
is_same_v
<
DInDataType
,
F8
>
&&
is_same_v
<
IndexDataType
,
I32
>
)
add_device_maxpool_bwd_f8_instances
(
op_ptrs
);
#endif
#ifdef CK_ENABLE_INT8
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
DOutDataType
,
I8
>
&&
is_same_v
<
DInDataType
,
I8
>
&&
else
if
constexpr
(
is_same_v
<
DOutDataType
,
I8
>
&&
is_same_v
<
DInDataType
,
I8
>
&&
is_same_v
<
IndexDataType
,
I32
>
)
is_same_v
<
IndexDataType
,
I32
>
)
...
...
library/include/ck/library/tensor_operation_instance/gpu/pool2d_fwd.hpp
View file @
6975cb8f
...
@@ -67,6 +67,36 @@ void add_device_pool2d_fwd_nhwc_index_f32_instances(
...
@@ -67,6 +67,36 @@ void add_device_pool2d_fwd_nhwc_index_f32_instances(
std
::
vector
<
std
::
unique_ptr
<
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
InOutRank
,
WindowRank
,
F32
,
F32
,
I32
,
NHWC
,
NHWC
,
MaxOp
,
true
>>>&
);
DevicePoolFwd
<
InOutRank
,
WindowRank
,
F32
,
F32
,
I32
,
NHWC
,
NHWC
,
MaxOp
,
true
>>>&
);
#endif
#endif
#ifdef CK_ENABLE_INT8
// I8
void
add_device_pool2d_fwd_nhwc_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
InOutRank
,
WindowRank
,
I8
,
I8
,
I32
,
NHWC
,
NHWC
,
MaxOp
,
false
>>>&
);
void
add_device_pool2d_fwd_nhwc_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
InOutRank
,
WindowRank
,
I8
,
I8
,
I32
,
NHWC
,
NHWC
,
AvgOp
,
false
>>>&
);
// I8 - return index
void
add_device_pool2d_fwd_nhwc_index_i8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
InOutRank
,
WindowRank
,
I8
,
I8
,
I32
,
NHWC
,
NHWC
,
MaxOp
,
true
>>>&
);
#endif
#ifdef CK_ENABLE_FP8
// F8
void
add_device_pool2d_fwd_nhwc_f8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
InOutRank
,
WindowRank
,
F8
,
F8
,
I32
,
NHWC
,
NHWC
,
MaxOp
,
false
>>>&
);
void
add_device_pool2d_fwd_nhwc_f8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
InOutRank
,
WindowRank
,
F8
,
F8
,
I32
,
NHWC
,
NHWC
,
AvgOp
,
false
>>>&
);
// F8 - return index
void
add_device_pool2d_fwd_nhwc_index_f8_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
InOutRank
,
WindowRank
,
F8
,
F8
,
I32
,
NHWC
,
NHWC
,
MaxOp
,
true
>>>&
);
#endif
template
<
typename
InDataType
,
template
<
typename
InDataType
,
typename
OutDataType
,
typename
OutDataType
,
typename
IndexDataType
,
typename
IndexDataType
,
...
@@ -140,6 +170,34 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
...
@@ -140,6 +170,34 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DevicePoolFw
add_device_pool2d_fwd_nhwc_f32_instances
(
op_ptrs
);
add_device_pool2d_fwd_nhwc_f32_instances
(
op_ptrs
);
}
}
}
}
#endif
#ifdef CK_ENABLE_INT8
else
if
constexpr
(
is_same_v
<
InDataType
,
I8
>
&&
is_same_v
<
OutDataType
,
I8
>
&&
is_same_v
<
IndexDataType
,
I32
>
)
{
if
constexpr
(
OutputIndex
&&
ReduceOpId
==
MaxOp
)
{
add_device_pool2d_fwd_nhwc_index_i8_instances
(
op_ptrs
);
}
else
{
add_device_pool2d_fwd_nhwc_i8_instances
(
op_ptrs
);
}
}
#endif
#ifdef CK_ENABLE_FP8
else
if
constexpr
(
is_same_v
<
InDataType
,
F8
>
&&
is_same_v
<
OutDataType
,
F8
>
&&
is_same_v
<
IndexDataType
,
I32
>
)
{
if
constexpr
(
OutputIndex
&&
ReduceOpId
==
MaxOp
)
{
add_device_pool2d_fwd_nhwc_index_f8_instances
(
op_ptrs
);
}
else
{
add_device_pool2d_fwd_nhwc_f8_instances
(
op_ptrs
);
}
}
#endif
#endif
}
}
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment