Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
3550cefe
Unverified
Commit
3550cefe
authored
Jun 16, 2023
by
rocking
Committed by
GitHub
Jun 16, 2023
Browse files
Merge branch 'develop' into max-pool-bwd
parents
508d7a59
027e46ee
Changes
145
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
608 additions
and
41 deletions
+608
-41
Dockerfile
Dockerfile
+7
-7
Jenkinsfile
Jenkinsfile
+2
-2
client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
+14
-0
client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp
...2_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp
+176
-0
client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
...nt_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
+1
-1
client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp
...le/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp
+169
-0
client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
+1
-1
client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp
...xample/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp
+162
-0
client_example/16_convnd_fwd/common.hpp
client_example/16_convnd_fwd/common.hpp
+0
-4
client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp
client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp
+2
-2
client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp
client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp
+2
-2
example/02_gemm_bilinear/CMakeLists.txt
example/02_gemm_bilinear/CMakeLists.txt
+15
-4
example/03_gemm_bias_relu/CMakeLists.txt
example/03_gemm_bias_relu/CMakeLists.txt
+7
-2
example/04_gemm_add_add_fastgelu/CMakeLists.txt
example/04_gemm_add_add_fastgelu/CMakeLists.txt
+7
-2
example/09_convnd_fwd/CMakeLists.txt
example/09_convnd_fwd/CMakeLists.txt
+8
-3
example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
...e/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
+7
-2
example/14_gemm_quantization/CMakeLists.txt
example/14_gemm_quantization/CMakeLists.txt
+7
-2
example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
+7
-2
example/17_convnd_bwd_data/CMakeLists.txt
example/17_convnd_bwd_data/CMakeLists.txt
+7
-2
example/18_batched_gemm_reduce/CMakeLists.txt
example/18_batched_gemm_reduce/CMakeLists.txt
+7
-3
No files found.
Dockerfile
View file @
3550cefe
...
...
@@ -12,20 +12,20 @@ RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins
RUN
chmod
1777 /tmp
RUN
apt-get update
RUN
apt-get
install
-y
--allow-unauthenticated
apt-utils wget gnupg2 curl
RUN
--mount
=
type
=
ssh
if
[
"
$ROCMVERSION
"
!=
"5.6"
]
;
then
\
RUN if
[
"
$ROCMVERSION
"
!=
"5.6"
]
;
then
\
wget
-qO
- http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
&&
\
sh
-c
"echo deb [arch=amd64]
$DEB_ROCM_REPO
ubuntu main > /etc/apt/sources.list.d/rocm.list"
;
\
elif
[
"
$ROCMVERSION
"
=
"5.6"
]
&&
[
"
$compiler_version
"
=
""
]
;
then
\
sh
-c
"wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb"
&&
\
apt update
&&
apt-get
install
-y
./amd-nonfree-radeon_20.04-1_all.deb
&&
\
amdgpu-repo
--amdgpu-build
=
1567752
--rocm-build
=
compute-rocm-dkms-no-npi-hipclang/11914
;
\
elif
[
"
$ROCMVERSION
"
=
"5.6"
]
&&
[
"
$compiler_version
"
=
"rc3"
]
;
then
\
amdgpu-repo
--amdgpu-build
=
1567752
--rocm-build
=
compute-rocm-dkms-no-npi-hipclang/11914
&&
\
amdgpu-install
-y
--usecase
=
rocm
--no-dkms
;
\
elif
[
"
$ROCMVERSION
"
=
"5.6"
]
&&
[
"
$compiler_version
"
=
"rc3"
]
||
[
"
$compiler_version
"
=
"amd-stg-open"
]
;
then
\
sh
-c
"wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_5.6-20.04-1_all.deb"
&&
\
apt update
&&
apt-get
install
-y
./amdgpu-install-internal_5.6-20.04-1_all.deb
&&
\
sh
-c
'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.6 rel-45 > /etc/apt/sources.list.d/rocm-build.list'
&&
\
amdgpu-repo
--amdgpu-build
=
1602498
;
\
amdgpu-repo
--amdgpu-build
=
1602498
&&
amdgpu-install
-y
--usecase
=
rocm
--no-dkms
;
\
fi
RUN
amdgpu-install
-y
--usecase
=
rocm
--no-dkms
RUN
wget
--no-check-certificate
-qO
- https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN
sh
-c
"echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
...
...
@@ -105,7 +105,7 @@ ENV compiler_commit=$compiler_commit
RUN
sh
-c
"echo compiler version = '
$compiler_version
'"
RUN
sh
-c
"echo compiler commit = '
$compiler_commit
'"
RUN
--mount
=
type
=
ssh
if
[
"
$compiler_version
"
=
"amd-stg-open"
]
&&
[
"
$compiler_commit
"
=
""
]
;
then
\
RUN if
[
"
$compiler_version
"
=
"amd-stg-open"
]
&&
[
"
$compiler_commit
"
=
""
]
;
then
\
git clone
-b
"
$compiler_version
"
https://github.com/RadeonOpenCompute/llvm-project.git
&&
\
cd
llvm-project
&&
mkdir
build
&&
cd
build
&&
\
cmake
-DCMAKE_INSTALL_PREFIX
=
/opt/rocm/llvm
-DCMAKE_BUILD_TYPE
=
Release
-DLLVM_ENABLE_ASSERTIONS
=
1
-DLLVM_TARGETS_TO_BUILD
=
"AMDGPU;X86"
-DLLVM_ENABLE_PROJECTS
=
"clang;lld;compiler-rt"
../llvm
&&
\
...
...
@@ -113,7 +113,7 @@ RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ] && [ "$compiler
else
echo
"using the release compiler"
;
\
fi
RUN
--mount
=
type
=
ssh
if
[
"
$compiler_version
"
=
"amd-stg-open"
]
&&
[
"
$compiler_commit
"
!=
""
]
;
then
\
RUN if
[
"
$compiler_version
"
=
"amd-stg-open"
]
&&
[
"
$compiler_commit
"
!=
""
]
;
then
\
git clone
-b
"
$compiler_version
"
https://github.com/RadeonOpenCompute/llvm-project.git
&&
\
cd
llvm-project
&&
git checkout
"
$compiler_commit
"
&&
echo
"checking out commit
$compiler_commit
"
&&
mkdir
build
&&
cd
build
&&
\
cmake
-DCMAKE_INSTALL_PREFIX
=
/opt/rocm/llvm
-DCMAKE_BUILD_TYPE
=
Release
-DLLVM_ENABLE_ASSERTIONS
=
1
-DLLVM_TARGETS_TO_BUILD
=
"AMDGPU;X86"
-DLLVM_ENABLE_PROJECTS
=
"clang;lld;compiler-rt"
../llvm
&&
\
...
...
Jenkinsfile
View file @
3550cefe
...
...
@@ -695,8 +695,8 @@ pipeline {
}
agent
{
label
rocmnode
(
"gfx908 || gfx90a"
)
}
environment
{
setup_args
=
""" -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940" """
execute_args
=
""" cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
setup_args
=
""" -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a;gfx940
;gfx941;gfx942
" """
execute_args
=
""" cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a;gfx940
;gfx941;gfx942
" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
}
steps
{
Build_CK_and_Reboot
(
setup_args:
setup_args
,
config_targets:
"install"
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
,
prefixpath:
'/usr/local'
)
...
...
client_example/02_gemm_add_add_fastgelu/CMakeLists.txt
View file @
3550cefe
...
...
@@ -11,3 +11,17 @@ target_link_libraries(client_gemm_fastgelu PRIVATE composable_kernel::device_ope
add_dependencies
(
client_gemm_fastgelu_examples client_gemm_add_add_fastgelu client_gemm_add_fastgelu
client_gemm_fastgelu
)
add_custom_target
(
client_gemm_fastgelu_generic_examples
)
add_executable
(
client_gemm_add_add_fastgelu_generic gemm_add_add_fastgelu_generic.cpp
)
target_link_libraries
(
client_gemm_add_add_fastgelu_generic PRIVATE composable_kernel::device_operations
)
add_executable
(
client_gemm_add_fastgelu_generic gemm_add_fastgelu_generic.cpp
)
target_link_libraries
(
client_gemm_add_fastgelu_generic PRIVATE composable_kernel::device_operations
)
add_executable
(
client_gemm_fastgelu_generic gemm_fastgelu_generic.cpp
)
target_link_libraries
(
client_gemm_fastgelu_generic PRIVATE composable_kernel::device_operations
)
add_dependencies
(
client_gemm_fastgelu_generic_examples client_gemm_add_add_fastgelu_generic
client_gemm_add_fastgelu_generic client_gemm_fastgelu_generic
)
client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu_generic.cpp
0 → 100644
View file @
3550cefe
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include <stdexcept>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
AddAddFastGelu
=
ck
::
tensor_operation
::
element_wise
::
AddAddFastGelu
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CDEElementOp
=
AddAddFastGelu
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
D0DataType
=
F16
;
using
D1DataType
=
F16
;
using
EDataType
=
F16
;
using
ALayout
=
Row
;
using
BLayout
=
Col
;
using
D0Layout
=
Row
;
using
D1Layout
=
Row
;
using
ELayout
=
Row
;
struct
SimpleDeviceMem
{
SimpleDeviceMem
()
=
delete
;
SimpleDeviceMem
(
std
::
size_t
mem_size
)
:
p_mem_
{}
{
(
void
)
hipMalloc
(
static_cast
<
void
**>
(
&
p_mem_
),
mem_size
);
}
void
*
GetDeviceBuffer
()
{
return
p_mem_
;
}
~
SimpleDeviceMem
()
{
(
void
)
hipFree
(
p_mem_
);
}
void
*
p_mem_
;
};
int
main
(
int
argc
,
char
*
argv
[])
{
// GEMM shape
ck
::
index_t
M
=
3840
;
ck
::
index_t
N
=
4096
;
ck
::
index_t
K
=
4096
;
ck
::
index_t
StrideA
=
4096
;
ck
::
index_t
StrideB
=
4096
;
ck
::
index_t
StrideD0
=
0
;
ck
::
index_t
StrideD1
=
4096
;
ck
::
index_t
StrideE
=
4096
;
if
(
argc
==
1
)
{
// use default case
}
else
if
(
argc
==
9
)
{
M
=
std
::
stoi
(
argv
[
1
]);
N
=
std
::
stoi
(
argv
[
2
]);
K
=
std
::
stoi
(
argv
[
3
]);
StrideA
=
std
::
stoi
(
argv
[
4
]);
StrideB
=
std
::
stoi
(
argv
[
5
]);
StrideD0
=
std
::
stoi
(
argv
[
6
]);
StrideD1
=
std
::
stoi
(
argv
[
7
]);
StrideE
=
std
::
stoi
(
argv
[
8
]);
}
else
{
printf
(
"arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE
\n
"
);
exit
(
0
);
}
auto
f_matrix_space_size
=
[](
std
::
size_t
nRow
,
std
::
size_t
nCol
,
std
::
size_t
stride
,
auto
layout
)
{
using
Layout
=
decltype
(
layout
);
if
constexpr
(
std
::
is_same
<
Layout
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
return
(
nRow
-
1
)
*
stride
+
nCol
;
}
else
{
return
(
nCol
-
1
)
*
stride
+
nRow
;
}
};
SimpleDeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
f_matrix_space_size
(
M
,
K
,
StrideA
,
ALayout
{}));
SimpleDeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
f_matrix_space_size
(
K
,
N
,
StrideB
,
BLayout
{}));
SimpleDeviceMem
d0_m_n_device_buf
(
sizeof
(
D0DataType
)
*
f_matrix_space_size
(
M
,
N
,
StrideD0
,
D0Layout
{}));
SimpleDeviceMem
d1_m_n_device_buf
(
sizeof
(
D1DataType
)
*
f_matrix_space_size
(
M
,
N
,
StrideD1
,
D1Layout
{}));
SimpleDeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
f_matrix_space_size
(
M
,
N
,
StrideE
,
ELayout
{}));
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGemmMultipleD
<
ALayout
,
BLayout
,
ck
::
Tuple
<
D0Layout
,
D1Layout
>
,
ELayout
,
ADataType
,
BDataType
,
ck
::
Tuple
<
D0DataType
,
D1DataType
>
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
AddAddFastGelu
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
const
auto
cde_element_op
=
CDEElementOp
{};
// get generic instance
auto
&
op_ptr
=
op_ptrs
[
0
];
std
::
cout
<<
"Run the generic instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
// run the generic instance
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
2
>
{
d0_m_n_device_buf
.
GetDeviceBuffer
(),
d1_m_n_device_buf
.
GetDeviceBuffer
()},
e_device_buf
.
GetDeviceBuffer
(),
M
,
N
,
K
,
StrideA
,
StrideB
,
std
::
array
<
ck
::
index_t
,
2
>
{
StrideD0
,
StrideD1
},
StrideE
,
a_element_op
,
b_element_op
,
cde_element_op
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
else
{
throw
std
::
runtime_error
(
"Generic instance should be suitable for various input lengths/strides"
);
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
return
0
;
}
client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
View file @
3550cefe
...
...
@@ -76,7 +76,7 @@ int main(int argc, char* argv[])
StrideA
=
std
::
stoi
(
argv
[
4
]);
StrideB
=
std
::
stoi
(
argv
[
5
]);
StrideD0
=
std
::
stoi
(
argv
[
6
]);
StrideE
=
std
::
stoi
(
argv
[
8
]);
StrideE
=
std
::
stoi
(
argv
[
7
]);
}
else
{
...
...
client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu_generic.cpp
0 → 100644
View file @
3550cefe
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include <stdexcept>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
AddFastGelu
=
ck
::
tensor_operation
::
element_wise
::
AddFastGelu
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CDEElementOp
=
AddFastGelu
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
D0DataType
=
F16
;
using
EDataType
=
F16
;
using
ALayout
=
Row
;
using
BLayout
=
Col
;
using
D0Layout
=
Row
;
using
ELayout
=
Row
;
struct
SimpleDeviceMem
{
SimpleDeviceMem
()
=
delete
;
SimpleDeviceMem
(
std
::
size_t
mem_size
)
:
p_mem_
{}
{
(
void
)
hipMalloc
(
static_cast
<
void
**>
(
&
p_mem_
),
mem_size
);
}
void
*
GetDeviceBuffer
()
{
return
p_mem_
;
}
~
SimpleDeviceMem
()
{
(
void
)
hipFree
(
p_mem_
);
}
void
*
p_mem_
;
};
int
main
(
int
argc
,
char
*
argv
[])
{
// GEMM shape
ck
::
index_t
M
=
3840
;
ck
::
index_t
N
=
4096
;
ck
::
index_t
K
=
4096
;
ck
::
index_t
StrideA
=
4096
;
ck
::
index_t
StrideB
=
4096
;
ck
::
index_t
StrideD0
=
0
;
ck
::
index_t
StrideE
=
4096
;
if
(
argc
==
1
)
{
// use default case
}
else
if
(
argc
==
8
)
{
M
=
std
::
stoi
(
argv
[
1
]);
N
=
std
::
stoi
(
argv
[
2
]);
K
=
std
::
stoi
(
argv
[
3
]);
StrideA
=
std
::
stoi
(
argv
[
4
]);
StrideB
=
std
::
stoi
(
argv
[
5
]);
StrideD0
=
std
::
stoi
(
argv
[
6
]);
StrideE
=
std
::
stoi
(
argv
[
7
]);
}
else
{
printf
(
"arg1 to 7: M, N, K, StrideA, StrideB, StrideD0, StrideE
\n
"
);
exit
(
0
);
}
auto
f_matrix_space_size
=
[](
std
::
size_t
nRow
,
std
::
size_t
nCol
,
std
::
size_t
stride
,
auto
layout
)
{
using
Layout
=
decltype
(
layout
);
if
constexpr
(
std
::
is_same
<
Layout
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
return
(
nRow
-
1
)
*
stride
+
nCol
;
}
else
{
return
(
nCol
-
1
)
*
stride
+
nRow
;
}
};
SimpleDeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
f_matrix_space_size
(
M
,
K
,
StrideA
,
ALayout
{}));
SimpleDeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
f_matrix_space_size
(
K
,
N
,
StrideB
,
BLayout
{}));
SimpleDeviceMem
d0_m_n_device_buf
(
sizeof
(
D0DataType
)
*
f_matrix_space_size
(
M
,
N
,
StrideD0
,
D0Layout
{}));
SimpleDeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
f_matrix_space_size
(
M
,
N
,
StrideE
,
ELayout
{}));
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGemmMultipleD
<
ALayout
,
BLayout
,
ck
::
Tuple
<
D0Layout
>
,
ELayout
,
ADataType
,
BDataType
,
ck
::
Tuple
<
D0DataType
>
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
AddFastGelu
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
const
auto
cde_element_op
=
CDEElementOp
{};
// get generic instance
auto
&
op_ptr
=
op_ptrs
[
0
];
std
::
cout
<<
"Run the generic instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
// run the generic instance
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
1
>
{
d0_m_n_device_buf
.
GetDeviceBuffer
()},
e_device_buf
.
GetDeviceBuffer
(),
M
,
N
,
K
,
StrideA
,
StrideB
,
std
::
array
<
ck
::
index_t
,
1
>
{
StrideD0
},
StrideE
,
a_element_op
,
b_element_op
,
cde_element_op
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
else
{
throw
std
::
runtime_error
(
"Generic instance should be suitable for various input lengths/strides"
);
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
return
0
;
}
client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
View file @
3550cefe
...
...
@@ -72,7 +72,7 @@ int main(int argc, char* argv[])
StrideA
=
std
::
stoi
(
argv
[
4
]);
StrideB
=
std
::
stoi
(
argv
[
5
]);
StrideE
=
std
::
stoi
(
argv
[
8
]);
StrideE
=
std
::
stoi
(
argv
[
6
]);
}
else
{
...
...
client_example/02_gemm_add_add_fastgelu/gemm_fastgelu_generic.cpp
0 → 100644
View file @
3550cefe
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include <stdexcept>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
FastGelu
=
ck
::
tensor_operation
::
element_wise
::
FastGelu
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CDEElementOp
=
FastGelu
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
EDataType
=
F16
;
using
ALayout
=
Row
;
using
BLayout
=
Col
;
using
ELayout
=
Row
;
struct
SimpleDeviceMem
{
SimpleDeviceMem
()
=
delete
;
SimpleDeviceMem
(
std
::
size_t
mem_size
)
:
p_mem_
{}
{
(
void
)
hipMalloc
(
static_cast
<
void
**>
(
&
p_mem_
),
mem_size
);
}
void
*
GetDeviceBuffer
()
{
return
p_mem_
;
}
~
SimpleDeviceMem
()
{
(
void
)
hipFree
(
p_mem_
);
}
void
*
p_mem_
;
};
int
main
(
int
argc
,
char
*
argv
[])
{
// GEMM shape
ck
::
index_t
M
=
3840
;
ck
::
index_t
N
=
4096
;
ck
::
index_t
K
=
4096
;
ck
::
index_t
StrideA
=
4096
;
ck
::
index_t
StrideB
=
4096
;
ck
::
index_t
StrideE
=
4096
;
if
(
argc
==
1
)
{
// use default case
}
else
if
(
argc
==
7
)
{
M
=
std
::
stoi
(
argv
[
1
]);
N
=
std
::
stoi
(
argv
[
2
]);
K
=
std
::
stoi
(
argv
[
3
]);
StrideA
=
std
::
stoi
(
argv
[
4
]);
StrideB
=
std
::
stoi
(
argv
[
5
]);
StrideE
=
std
::
stoi
(
argv
[
6
]);
}
else
{
printf
(
"arg1 to 6: M, N, K, StrideA, StrideB, StrideE
\n
"
);
exit
(
0
);
}
auto
f_matrix_space_size
=
[](
std
::
size_t
nRow
,
std
::
size_t
nCol
,
std
::
size_t
stride
,
auto
layout
)
{
using
Layout
=
decltype
(
layout
);
if
constexpr
(
std
::
is_same
<
Layout
,
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
return
(
nRow
-
1
)
*
stride
+
nCol
;
}
else
{
return
(
nCol
-
1
)
*
stride
+
nRow
;
}
};
SimpleDeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
f_matrix_space_size
(
M
,
K
,
StrideA
,
ALayout
{}));
SimpleDeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
f_matrix_space_size
(
K
,
N
,
StrideB
,
BLayout
{}));
SimpleDeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
f_matrix_space_size
(
M
,
N
,
StrideE
,
ELayout
{}));
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceGemmMultipleD
<
ALayout
,
BLayout
,
ck
::
Tuple
<>
,
ELayout
,
ADataType
,
BDataType
,
ck
::
Tuple
<>
,
EDataType
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
FastGelu
>
;
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
DeviceOp
>::
GetInstances
();
std
::
cout
<<
"found "
<<
op_ptrs
.
size
()
<<
" instances"
<<
std
::
endl
;
const
auto
a_element_op
=
AElementOp
{};
const
auto
b_element_op
=
BElementOp
{};
const
auto
cde_element_op
=
CDEElementOp
{};
// get generic instance
auto
&
op_ptr
=
op_ptrs
[
0
];
std
::
cout
<<
"Run the generic instance without timing: "
<<
op_ptr
->
GetTypeString
()
<<
std
::
endl
;
// run the generic instance
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
{},
e_device_buf
.
GetDeviceBuffer
(),
M
,
N
,
K
,
StrideA
,
StrideB
,
{},
StrideE
,
a_element_op
,
b_element_op
,
cde_element_op
);
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
if
(
op_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
false
});
}
else
{
throw
std
::
runtime_error
(
"Generic instance should be suitable for various input lengths/strides"
);
}
std
::
cout
<<
"Done"
<<
std
::
endl
;
return
0
;
}
client_example/16_convnd_fwd/common.hpp
View file @
3550cefe
...
...
@@ -141,14 +141,10 @@ bool run_grouped_conv_fwd(std::array<ck::index_t, NumDimSpatial + NumNonSpatialD
std
::
next
(
rbegin
(
in_strides
)),
std
::
next
(
rbegin
(
in_strides
),
NumDimSpatial
+
1
));
std
::
rotate
(
std
::
next
(
rbegin
(
wei_lengths
)),
std
::
next
(
rbegin
(
wei_lengths
),
2
),
rend
(
wei_lengths
));
std
::
rotate
(
rbegin
(
wei_lengths
),
std
::
next
(
rbegin
(
wei_lengths
)),
std
::
next
(
rbegin
(
wei_lengths
),
NumDimSpatial
+
1
));
std
::
rotate
(
std
::
next
(
rbegin
(
wei_strides
)),
std
::
next
(
rbegin
(
wei_strides
),
2
),
rend
(
wei_strides
));
std
::
rotate
(
rbegin
(
wei_strides
),
std
::
next
(
rbegin
(
wei_strides
)),
std
::
next
(
rbegin
(
wei_strides
),
NumDimSpatial
+
1
));
...
...
client_example/16_convnd_fwd/conv3d_fwd_fp16.cpp
View file @
3550cefe
...
...
@@ -11,7 +11,7 @@ using WeiDataType = ck::half_t;
using
OutDataType
=
ck
::
half_t
;
using
InLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWGC
;
using
WeiLayout
=
ck
::
tensor_layout
::
convolution
::
KZYX
G
C
;
using
WeiLayout
=
ck
::
tensor_layout
::
convolution
::
G
KZYXC
;
using
OutLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWGK
;
static
constexpr
ck
::
index_t
NumDimSpatial
=
3
;
...
...
@@ -38,7 +38,7 @@ int main()
InLayout
,
WeiLayout
,
OutLayout
>
(
{
N
,
Di
,
Hi
,
Wi
,
G
,
C
},
{
K
,
Z
,
Y
,
X
,
G
,
C
},
{
N
,
Do
,
Ho
,
Wo
,
G
,
K
})
{
N
,
Di
,
Hi
,
Wi
,
G
,
C
},
{
G
,
K
,
Z
,
Y
,
X
,
C
},
{
N
,
Do
,
Ho
,
Wo
,
G
,
K
})
?
EXIT_SUCCESS
:
EXIT_FAILURE
;
}
client_example/16_convnd_fwd/conv3d_fwd_fp32.cpp
View file @
3550cefe
...
...
@@ -11,7 +11,7 @@ using WeiDataType = float;
using
OutDataType
=
float
;
using
InLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWGC
;
using
WeiLayout
=
ck
::
tensor_layout
::
convolution
::
KZYX
G
C
;
using
WeiLayout
=
ck
::
tensor_layout
::
convolution
::
G
KZYXC
;
using
OutLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWGK
;
static
constexpr
ck
::
index_t
NumDimSpatial
=
3
;
...
...
@@ -38,7 +38,7 @@ int main()
InLayout
,
WeiLayout
,
OutLayout
>
(
{
N
,
Di
,
Hi
,
Wi
,
G
,
C
},
{
K
,
Z
,
Y
,
X
,
G
,
C
},
{
N
,
Do
,
Ho
,
Wo
,
G
,
K
})
{
N
,
Di
,
Hi
,
Wi
,
G
,
C
},
{
G
,
K
,
Z
,
Y
,
X
,
C
},
{
N
,
Do
,
Ho
,
Wo
,
G
,
K
})
?
EXIT_SUCCESS
:
EXIT_FAILURE
;
}
example/02_gemm_bilinear/CMakeLists.txt
View file @
3550cefe
if
(
GPU_TARGETS MATCHES
"gfx1100"
OR GPU_TARGETS MATCHES
"gfx1101"
OR GPU_TARGETS MATCHES
"gfx1102"
)
list
(
APPEND gpu_list1 gfx1100 gfx1101 gfx1102
)
list
(
APPEND gpu_list2 gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list1 AND target EQUAL 0
)
add_example_executable
(
example_gemm_bilinear_wmma_fp16 gemm_bilinear_wmma_fp16.cpp
)
endif
()
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
set
(
target 1
)
endif
()
endforeach
()
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list2 AND target EQUAL 0
)
add_example_executable
(
example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp
)
endif
()
set
(
target 1
)
endif
()
endforeach
()
example/03_gemm_bias_relu/CMakeLists.txt
View file @
3550cefe
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_example_executable
(
example_gemm_bias_relu_xdl_fp16 gemm_bias_relu_xdl_fp16.cpp
)
endif
()
\ No newline at end of file
set
(
target 1
)
endif
()
endforeach
()
example/04_gemm_add_add_fastgelu/CMakeLists.txt
View file @
3550cefe
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_custom_target
(
example_gemm_add_add_fastgelu_xdl
)
add_example_executable
(
example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp
)
...
...
@@ -16,4 +19,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
add_dependencies
(
example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4
)
endif
(
USE_BITINT_EXTENSION_INT4
)
add_dependencies
(
example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8
)
endif
()
\ No newline at end of file
set
(
target 1
)
endif
()
endforeach
()
\ No newline at end of file
example/09_convnd_fwd/CMakeLists.txt
View file @
3550cefe
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_example_executable
(
example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp
)
add_example_executable
(
example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp
)
add_example_executable
(
example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp
)
add_example_executable
(
example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp
)
# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
add_example_executable_no_testing
(
example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp
)
endif
()
set
(
target 1
)
endif
()
endforeach
()
add_example_executable
(
example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp
)
add_example_executable
(
example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp
)
add_example_executable
(
example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp
)
example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt
View file @
3550cefe
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_custom_target
(
example_convnd_fwd_reduce_xdl
)
add_example_executable
(
example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp
)
add_example_executable_no_testing
(
example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp
)
...
...
@@ -12,4 +15,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
add_example_executable
(
example_convnd_fwd_max_xdl_int4 convnd_fwd_max_xdl_int4.cpp
)
add_dependencies
(
example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int4
)
endif
(
USE_BITINT_EXTENSION_INT4
)
endif
()
\ No newline at end of file
set
(
target 1
)
endif
()
endforeach
()
\ No newline at end of file
example/14_gemm_quantization/CMakeLists.txt
View file @
3550cefe
...
...
@@ -2,7 +2,12 @@
add_example_executable
(
example_gemm_dl_quantization_int8 gemm_dl_quantization_int8.cpp
)
# xdlops
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_example_executable
(
example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp
)
add_example_executable
(
example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp
)
endif
()
\ No newline at end of file
set
(
target 1
)
endif
()
endforeach
()
\ No newline at end of file
example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
View file @
3550cefe
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_custom_target
(
example_gemm_reduce_xdl
)
add_custom_target
(
example_gemm_reduce_xdl_max
)
add_custom_target
(
example_gemm_reduce_xdl_mean_meansquare
)
...
...
@@ -39,4 +42,6 @@ if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS M
add_example_executable
(
example_gemm_max_xdl_int4 gemm_max_xdl_int4.cpp
)
add_dependencies
(
example_gemm_reduce_xdl_max example_gemm_max_xdl_int4
)
endif
()
endif
()
set
(
target 1
)
endif
()
endforeach
()
example/17_convnd_bwd_data/CMakeLists.txt
View file @
3550cefe
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_example_executable
(
example_convnd_bwd_data_xdl_fp16 convnd_bwd_data_xdl_fp16.cpp
)
target_link_libraries
(
example_convnd_bwd_data_xdl_fp16 PRIVATE utility
)
endif
()
set
(
target 1
)
endif
()
endforeach
()
add_example_executable
(
example_convnd_bwd_data_dl_fp16 convnd_bwd_data_dl_fp16.cpp
)
target_link_libraries
(
example_convnd_bwd_data_dl_fp16 PRIVATE utility
)
example/18_batched_gemm_reduce/CMakeLists.txt
View file @
3550cefe
if
(
GPU_TARGETS MATCHES
"gfx908"
OR GPU_TARGETS MATCHES
"gfx90a"
OR GPU_TARGETS MATCHES
"gfx940"
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_example_executable
(
example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp
)
endif
()
set
(
target 1
)
endif
()
endforeach
()
Prev
1
2
3
4
5
…
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment