Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
91e611e8
Unverified
Commit
91e611e8
authored
Mar 09, 2023
by
zjing14
Committed by
GitHub
Mar 09, 2023
Browse files
Merge branch 'develop' into lwpck-586
parents
69413c0f
0ccecc7c
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
27 additions
and
26 deletions
+27
-26
CHANGELOG.md
CHANGELOG.md
+2
-1
Jenkinsfile
Jenkinsfile
+2
-2
example/01_gemm/CMakeLists.txt
example/01_gemm/CMakeLists.txt
+1
-1
example/02_gemm_bilinear/CMakeLists.txt
example/02_gemm_bilinear/CMakeLists.txt
+1
-1
example/29_batched_gemm_bias_e_permute/CMakeLists.txt
example/29_batched_gemm_bias_e_permute/CMakeLists.txt
+1
-1
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+1
-1
include/ck/ck.hpp
include/ck/ck.hpp
+2
-10
include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
...l/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
+2
-1
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
.../gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
+2
-1
include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
.../ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
+2
-1
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
...impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
+2
-1
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
...ation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+7
-4
include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+2
-1
No files found.
CHANGELOG.md
View file @
91e611e8
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
Full documentation for Composable Kernel is not yet available.
Full documentation for Composable Kernel is not yet available.
## CK 0.
1.1
for ROCm 5.5.0
## CK 0.
2.0
for ROCm 5.5.0
### Fixed
### Fixed
-
Fixed a bug in 6-dimensional kernels (#555).
-
Fixed a bug in 6-dimensional kernels (#555).
...
@@ -12,6 +12,7 @@ Full documentation for Composable Kernel is not yet available.
...
@@ -12,6 +12,7 @@ Full documentation for Composable Kernel is not yet available.
-
Improve proformance of normalization kernel
-
Improve proformance of normalization kernel
### Added
### Added
-
Added support on NAVI3x.
-
Added user tutorial (#563).
-
Added user tutorial (#563).
-
Added more instances for irregular GEMM sizes (#560).
-
Added more instances for irregular GEMM sizes (#560).
-
Added inter-wave consumer-producer programming model for GEMM kernels (#310).
-
Added inter-wave consumer-producer programming model for GEMM kernels (#310).
...
...
Jenkinsfile
View file @
91e611e8
...
@@ -684,8 +684,8 @@ pipeline {
...
@@ -684,8 +684,8 @@ pipeline {
}
}
agent
{
label
rocmnode
(
"navi21"
)
}
agent
{
label
rocmnode
(
"navi21"
)
}
environment
{
environment
{
setup_args
=
""" -DCMAKE_INSTALL_PREFIX=../install
-DGPU_TARGETS="gfx1030"
"""
setup_args
=
""" -DCMAKE_INSTALL_PREFIX=../install """
execute_args
=
""" cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -D
GPU_TARGETS="gfx1030
" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
execute_args
=
""" cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -D
CMAKE_CXX_FLAGS=" --offload-arch=gfx1030 --offload-arch=gfx1100 --offload-arch=gfx1101 --offload-arch=gfx1102
" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
}
}
steps
{
steps
{
...
...
example/01_gemm/CMakeLists.txt
View file @
91e611e8
...
@@ -38,7 +38,7 @@ add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
...
@@ -38,7 +38,7 @@ add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16
)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16
)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_fp64
)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_fp64
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
OR GPU_TARGETS MATCHES
"gfx1101"
OR GPU_TARGETS MATCHES
"gfx1102"
)
add_custom_target
(
example_gemm_wmma
)
add_custom_target
(
example_gemm_wmma
)
add_example_executable
(
example_gemm_wmma_fp16 gemm_wmma_fp16.cpp
)
add_example_executable
(
example_gemm_wmma_fp16 gemm_wmma_fp16.cpp
)
add_dependencies
(
example_gemm_wmma example_gemm_wmma_fp16
)
add_dependencies
(
example_gemm_wmma example_gemm_wmma_fp16
)
...
...
example/02_gemm_bilinear/CMakeLists.txt
View file @
91e611e8
add_example_executable
(
example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp
)
add_example_executable
(
example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
OR GPU_TARGETS MATCHES
"gfx1101"
OR GPU_TARGETS MATCHES
"gfx1102"
)
add_example_executable
(
example_gemm_bilinear_wmma_fp16 gemm_bilinear_wmma_fp16.cpp
)
add_example_executable
(
example_gemm_bilinear_wmma_fp16 gemm_bilinear_wmma_fp16.cpp
)
endif
()
endif
()
example/29_batched_gemm_bias_e_permute/CMakeLists.txt
View file @
91e611e8
add_example_executable
(
example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp
)
add_example_executable
(
example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
OR GPU_TARGETS MATCHES
"gfx1101"
OR GPU_TARGETS MATCHES
"gfx1102"
)
add_example_executable
(
example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp
)
add_example_executable
(
example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp
)
endif
()
endif
()
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
View file @
91e611e8
...
@@ -16,7 +16,7 @@ if(USE_BITINT_EXTENSION_INT4)
...
@@ -16,7 +16,7 @@ if(USE_BITINT_EXTENSION_INT4)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4
)
endif
()
# USE_BITINT_EXTENSION_INT4
endif
()
# USE_BITINT_EXTENSION_INT4
if
(
GPU_TARGETS MATCHES
"gfx1100"
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
OR GPU_TARGETS MATCHES
"gfx1101"
OR GPU_TARGETS MATCHES
"gfx1102"
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
)
endif
()
endif
()
...
...
include/ck/ck.hpp
View file @
91e611e8
...
@@ -27,14 +27,6 @@
...
@@ -27,14 +27,6 @@
#define CK_WAVELET_MIN_BLOCK_PER_CU 2
#define CK_WAVELET_MIN_BLOCK_PER_CU 2
#endif
#endif
// check GPU target
#ifdef __HIP_DEVICE_COMPILE__
#if !(defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) || defined(__gfx1030__) || defined(__gfx1100__))
#error Not supported target
#endif
#endif
// buffer resource
// buffer resource
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_BUFFER_RESOURCE_3RD_DWORD -1
#define CK_BUFFER_RESOURCE_3RD_DWORD -1
...
@@ -43,7 +35,7 @@
...
@@ -43,7 +35,7 @@
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif defined(__gfx1030__) // for GPU code
#elif defined(__gfx1030__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#elif defined(__gfx1100__) // for GPU code
#elif defined(__gfx1100__)
|| defined(__gfx1101__) || defined(__gfx1102__)
// for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x10020000
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x10020000
#endif
#endif
...
@@ -72,7 +64,7 @@
...
@@ -72,7 +64,7 @@
// WMMA instruction
// WMMA instruction
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_USE_AMD_WMMA
#define CK_USE_AMD_WMMA
#elif defined(__gfx1100__) // for GPU code
#elif defined(__gfx1100__)
|| defined(__gfx1101__) || defined(__gfx1102__)
// for GPU code
#define CK_USE_AMD_WMMA
#define CK_USE_AMD_WMMA
#endif
#endif
...
...
include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
View file @
91e611e8
...
@@ -770,7 +770,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
...
@@ -770,7 +770,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
{
if
(
ck
::
get_device_name
()
==
"gfx1100"
)
if
(
ck
::
get_device_name
()
==
"gfx1100"
||
ck
::
get_device_name
()
==
"gfx1101"
||
ck
::
get_device_name
()
==
"gfx1102"
)
{
{
if
constexpr
(
!
(
is_same_v
<
AccDataType
,
float
>
||
is_same_v
<
AccDataType
,
int32_t
>
))
if
constexpr
(
!
(
is_same_v
<
AccDataType
,
float
>
||
is_same_v
<
AccDataType
,
int32_t
>
))
{
{
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
View file @
91e611e8
...
@@ -476,7 +476,8 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD<ALayout,
...
@@ -476,7 +476,8 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD<ALayout,
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
{
if
(
ck
::
get_device_name
()
==
"gfx1100"
)
if
(
ck
::
get_device_name
()
==
"gfx1100"
||
ck
::
get_device_name
()
==
"gfx1101"
||
ck
::
get_device_name
()
==
"gfx1102"
)
{
{
if
constexpr
(
!
(
is_same_v
<
AccDataType
,
float
>
||
is_same_v
<
AccDataType
,
int32_t
>
))
if
constexpr
(
!
(
is_same_v
<
AccDataType
,
float
>
||
is_same_v
<
AccDataType
,
int32_t
>
))
{
{
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
View file @
91e611e8
...
@@ -404,7 +404,8 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm<ALayout,
...
@@ -404,7 +404,8 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm<ALayout,
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
{
if
(
ck
::
get_device_name
()
==
"gfx1100"
)
if
(
ck
::
get_device_name
()
==
"gfx1100"
||
ck
::
get_device_name
()
==
"gfx1101"
||
ck
::
get_device_name
()
==
"gfx1102"
)
{
{
if
constexpr
(
!
(
is_same_v
<
AccDataType
,
float
>
||
is_same_v
<
AccDataType
,
int32_t
>
))
if
constexpr
(
!
(
is_same_v
<
AccDataType
,
float
>
||
is_same_v
<
AccDataType
,
int32_t
>
))
{
{
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
View file @
91e611e8
...
@@ -579,7 +579,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
...
@@ -579,7 +579,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
namespace
ctc
=
tensor_layout
::
convolution
;
namespace
ctc
=
tensor_layout
::
convolution
;
// check device
// check device
if
(
get_device_name
()
==
"gfx1100"
)
if
(
get_device_name
()
==
"gfx1100"
||
get_device_name
()
==
"gfx1101"
||
ck
::
get_device_name
()
==
"gfx1102"
)
{
{
if
constexpr
(
!
(
is_same_v
<
AccDataType
,
float
>
||
is_same_v
<
AccDataType
,
int32_t
>
))
if
constexpr
(
!
(
is_same_v
<
AccDataType
,
float
>
||
is_same_v
<
AccDataType
,
int32_t
>
))
{
{
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
View file @
91e611e8
...
@@ -54,7 +54,8 @@ __global__ void
...
@@ -54,7 +54,8 @@ __global__ void
const
Block2CTileMap
block_2_ctile_map
,
const
Block2CTileMap
block_2_ctile_map
,
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
)
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
)
{
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__))
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \
defined(__gfx1102__))
// offset base pointer for each work-group
// offset base pointer for each work-group
const
index_t
num_blocks_per_batch
=
const
index_t
num_blocks_per_batch
=
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
__builtin_amdgcn_readfirstlane
(
get_grid_size
()
/
batch_count
);
...
@@ -147,7 +148,8 @@ __global__ void
...
@@ -147,7 +148,8 @@ __global__ void
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
,
const
ComputePtrOffsetOfBatch
compute_ptr_offset_of_batch
,
const
Block2CTileMap
block_2_etile_map
)
const
Block2CTileMap
block_2_etile_map
)
{
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__))
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \
defined(__gfx1102__))
// printf("entry kernel launch");
// printf("entry kernel launch");
__shared__
char
p_shared
[
GridwiseOp
::
GetSharedMemoryNumberOfByte
()];
__shared__
char
p_shared
[
GridwiseOp
::
GetSharedMemoryNumberOfByte
()];
...
@@ -242,7 +244,8 @@ __global__ void
...
@@ -242,7 +244,8 @@ __global__ void
const
CDEElementwiseOperation
cde_element_op
,
const
CDEElementwiseOperation
cde_element_op
,
const
Block2CTileMap
block_2_ctile_map
)
const
Block2CTileMap
block_2_ctile_map
)
{
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__))
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \
defined(__gfx1102__))
__shared__
char
p_shared
[
GridwiseOp
::
GetSharedMemoryNumberOfByte
()];
__shared__
char
p_shared
[
GridwiseOp
::
GetSharedMemoryNumberOfByte
()];
GridwiseOp
::
template
Run
<
HasMainKBlockLoop
>(
p_a_grid
,
GridwiseOp
::
template
Run
<
HasMainKBlockLoop
>(
p_a_grid
,
...
@@ -271,7 +274,7 @@ __global__ void
...
@@ -271,7 +274,7 @@ __global__ void
ignore
=
b_element_op
;
ignore
=
b_element_op
;
ignore
=
cde_element_op
;
ignore
=
cde_element_op
;
ignore
=
block_2_ctile_map
;
ignore
=
block_2_ctile_map
;
#endif // end of if (defined(__gfx1100__))
#endif // end of if (defined(__gfx1100__
))
}
}
template
<
// DataType Family
template
<
// DataType Family
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
View file @
91e611e8
...
@@ -49,7 +49,8 @@ __global__ void
...
@@ -49,7 +49,8 @@ __global__ void
const
CElementwiseOperation
c_element_op
,
const
CElementwiseOperation
c_element_op
,
const
Block2CTileMap
block_2_ctile_map
)
const
Block2CTileMap
block_2_ctile_map
)
{
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__))
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \
defined(__gfx1102__))
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
__shared__
char
p_shared
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
GridwiseGemm
::
template
Run
<
HasMainKBlockLoop
>(
p_a_grid
,
GridwiseGemm
::
template
Run
<
HasMainKBlockLoop
>(
p_a_grid
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment